rdrpcatch 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rdrpcatch/__init__.py +0 -0
- rdrpcatch/cli/__init__.py +0 -0
- rdrpcatch/cli/args.py +358 -0
- rdrpcatch/rdrpcatch_scripts/__init__.py +0 -0
- rdrpcatch/rdrpcatch_scripts/fetch_dbs.py +302 -0
- rdrpcatch/rdrpcatch_scripts/format_pyhmmer_out.py +589 -0
- rdrpcatch/rdrpcatch_scripts/gui.py +256 -0
- rdrpcatch/rdrpcatch_scripts/mmseqs_tax.py +100 -0
- rdrpcatch/rdrpcatch_scripts/paths.py +162 -0
- rdrpcatch/rdrpcatch_scripts/plot.py +165 -0
- rdrpcatch/rdrpcatch_scripts/run_pyhmmer.py +155 -0
- rdrpcatch/rdrpcatch_scripts/run_seqkit.py +112 -0
- rdrpcatch/rdrpcatch_scripts/utils.py +414 -0
- rdrpcatch/rdrpcatch_wrapper.py +666 -0
- rdrpcatch-0.0.1.dist-info/METADATA +223 -0
- rdrpcatch-0.0.1.dist-info/RECORD +19 -0
- rdrpcatch-0.0.1.dist-info/WHEEL +4 -0
- rdrpcatch-0.0.1.dist-info/entry_points.txt +2 -0
- rdrpcatch-0.0.1.dist-info/licenses/LICENCE +9 -0
|
@@ -0,0 +1,589 @@
|
|
|
1
|
+
import polars as pl
|
|
2
|
+
import re
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def calculate_true_coverage(starts: list, ends: list) -> int:
|
|
8
|
+
"""Optimized coverage calculation using interval merging
|
|
9
|
+
|
|
10
|
+
:param starts: List of start positions
|
|
11
|
+
:type starts: list
|
|
12
|
+
:param ends: List of end positions
|
|
13
|
+
:type ends: list
|
|
14
|
+
:return: Total coverage
|
|
15
|
+
:rtype: int
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
if not starts:
|
|
19
|
+
return 0
|
|
20
|
+
|
|
21
|
+
intervals = sorted(zip(starts, ends))
|
|
22
|
+
merged = []
|
|
23
|
+
current_start, current_end = intervals[0]
|
|
24
|
+
|
|
25
|
+
for start, end in intervals[1:]:
|
|
26
|
+
if start <= current_end + 1: # Handle adjacent ranges
|
|
27
|
+
current_end = max(current_end, end)
|
|
28
|
+
else:
|
|
29
|
+
merged.append((current_start, current_end))
|
|
30
|
+
current_start, current_end = start, end
|
|
31
|
+
|
|
32
|
+
merged.append((current_start, current_end))
|
|
33
|
+
return sum(end - start + 1 for start, end in merged)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class hmmsearch_formatter:
    """
    Parses and post-processes hmmsearch domain-table output.

    Reads the tab-separated ".custom.tsv" companion of the raw hmmsearch
    output, computes normalized bitscores and coverage statistics, and
    writes a processed TSV whose column layout depends on the sequence type.

    Attributes:
        hmm_output_file: Path to the raw hmmsearch output file.

    Methods:
        calculate_norm_bitscore_profile(data_df): Adds bitscore / profile length.
        calculate_norm_bitscore_contig(data_df): Adds bitscore / contig length.
        calculate_coverage_stats(data_df): Adds coverage and ID-score columns.
        export_processed_file_aa(data_df, outfile): Writes output for protein input.
        export_processed_file_dna(data_df, outfile): Writes output for nucleotide input.
    """

    def __init__(self, hmm_raw, hmm_processed, seq_type):
        """
        Constructor for the hmmsearch_formatter class.

        :param hmm_raw: Path to the raw hmmsearch output file.
        :type hmm_raw: pathlib.Path
        :param hmm_processed: Path to the processed output file.
        :type hmm_processed: str
        :param seq_type: Input sequence type, 'prot' or 'nuc'.
        :type seq_type: str

        If PROTEIN: contig name is the first column.
        If DNA: contig name is the last column, first column is the translated
        sequence name (e.g. contig_name_frame).
        """
        self.hmm_output_file = hmm_raw
        # The pyhmmer runner writes a tab-separated ".custom.tsv" companion
        # next to the raw file; that is what we actually parse.
        hmm_custom = str(hmm_raw.with_suffix('.custom.tsv'))

        # Parse and process the data using Polars DataFrame operations
        data_df = pl.read_csv(hmm_custom, separator='\t')

        if data_df.is_empty():
            # No hits: write a header-only output file and return early.
            # The placeholder frame uses the *output* column names, so letting
            # it reach the export methods below (which select raw columns such
            # as 't_name') would raise a ColumnNotFoundError.
            title_line = ['Contig_name', 'Translated_contig_name (frame)', 'Sequence_length(AA)', 'Profile_name',
                          'Profile_length', 'E-value', 'score', 'acc', 'norm_bitscore_profile',
                          'norm_bitscore_contig', 'ID_score', 'RdRp_from(AA)', 'RdRp_to(AA)', 'profile_coverage',
                          'contig_coverage']
            data_df = pl.DataFrame({col: [] for col in title_line})
            data_df.write_csv(hmm_processed, separator="\t")
            return

        data_df = self.calculate_norm_bitscore_profile(data_df)
        data_df = self.calculate_norm_bitscore_contig(data_df)
        data_df = self.calculate_coverage_stats(data_df)

        if seq_type == 'prot':
            self.export_processed_file_aa(data_df, hmm_processed)
        elif seq_type == 'nuc':
            self.export_processed_file_dna(data_df, hmm_processed)

    def calculate_norm_bitscore_profile(self, data_df):
        """
        Adds a 'norm_bitscore_profile' column: bitscore divided by profile
        (query) length.

        :param data_df: DataFrame containing the parsed data.
        :type data_df: pl.DataFrame
        :return: DataFrame with the added column.
        :rtype: pl.DataFrame
        """
        data_df = (data_df.with_columns([
            (pl.col('score') / pl.col('qlen')).alias('norm_bitscore_profile')]))
        return data_df

    def calculate_norm_bitscore_contig(self, data_df):
        """
        Adds a 'norm_bitscore_contig' column: bitscore divided by contig
        (target) length.

        :param data_df: DataFrame containing the parsed data.
        :type data_df: pl.DataFrame
        :return: DataFrame with the added column.
        :rtype: pl.DataFrame
        """
        data_df = (data_df.with_columns([
            (pl.col('score') / pl.col('tlen')).alias('norm_bitscore_contig')]))
        return data_df

    def calculate_coverage_stats(self, data_df):
        """
        Calculates per (contig, profile) coverage statistics.

        Adds 'contig_coverage' and 'profile_coverage' (fractions of target /
        profile length covered by the merged domain envelopes), an 'ID_score'
        (bitscore per aligned position), and the overall RdRp envelope
        coordinates ('RdRp_start', 'RdRp_end').

        :param data_df: DataFrame containing the parsed data.
        :type data_df: pl.DataFrame
        :return: DataFrame with coverage statistics added.
        :rtype: pl.DataFrame
        """
        # Coordinates may be parsed as strings/floats; force integers before
        # any interval arithmetic.
        df = data_df.with_columns(
            pl.col("env_from").cast(pl.Int64),
            pl.col("env_to").cast(pl.Int64),
            pl.col("hmm_from").cast(pl.Int64),
            pl.col("hmm_to").cast(pl.Int64),
            pl.col("ali_from").cast(pl.Int64),
            pl.col("ali_to").cast(pl.Int64)
        )

        stats_df = (
            df
            # Preserve original row order through the join.
            .with_row_index("row_id")
            .join(
                df.group_by(["t_name", "q_name"])
                .agg(
                    pl.col("env_from").alias("starts"),
                    pl.col("env_to").alias("ends"),
                    pl.col("hmm_from").alias("hmm_starts"),
                    pl.col("hmm_to").alias("hmm_ends"),
                    pl.col("ali_from").alias("ali_starts"),
                    pl.col("ali_to").alias("ali_ends"),
                    pl.col("tlen").first().alias("tlen"),
                    pl.col("qlen").first().alias("qlen"),
                    pl.col("score").first().alias("score"),
                    pl.col("env_from").min().alias("RdRp_start"),
                    pl.col("env_to").max().alias("RdRp_end"),
                    pl.len().alias("row_count")
                )
                .with_columns(
                    # Single-domain groups get the cheap direct length;
                    # multi-domain groups merge intervals in Python via
                    # calculate_true_coverage.
                    contig_coverage=pl.when(pl.col("row_count") == 1)
                    .then(pl.col("ends").list.first() - pl.col("starts").list.first() + 1)
                    .otherwise(
                        pl.struct(["starts", "ends"])
                        .map_elements(lambda x: calculate_true_coverage(x["starts"], x["ends"]), return_dtype=pl.Int64)
                    ),
                    profile_coverage=pl.when(pl.col("row_count") == 1)
                    .then(pl.col("hmm_ends").list.first() - pl.col("hmm_starts").list.first() + 1)
                    .otherwise(
                        pl.struct(["hmm_starts", "hmm_ends"])
                        .map_elements(lambda x: calculate_true_coverage(x["hmm_starts"], x["hmm_ends"]), return_dtype=pl.Int64)
                    ),
                    aligned_coverage=pl.when(pl.col("row_count") == 1)
                    .then(pl.col("ali_ends").list.first() - pl.col("ali_starts").list.first() + 1)
                    .otherwise(
                        pl.struct(["ali_starts", "ali_ends"])
                        .map_elements(lambda x: calculate_true_coverage(x["ali_starts"], x["ali_ends"]), return_dtype=pl.Int64)
                    )
                )
                .with_columns(
                    # Convert absolute covered lengths to fractions, and
                    # bitscore to a per-aligned-position identity score.
                    contig_coverage=(pl.col("contig_coverage") / pl.col("tlen")).alias("contig_coverage"),
                    profile_coverage=(pl.col("profile_coverage") / pl.col("qlen")).alias("profile_coverage"),
                    ID_score=(pl.col("score") / pl.col("aligned_coverage")).alias("ID_score")
                )
                .select(
                    ["t_name", "q_name", "contig_coverage", "profile_coverage", "ID_score", "RdRp_start", "RdRp_end"]),
                on=["t_name", "q_name"]
            )
            .sort("row_id")
            .drop("row_id")
        )
        # Group by contig and profile name, keep the first occurrence of all columns
        stats_df = (
            stats_df
            .group_by(["t_name", "q_name"])
            .agg(
                pl.col("*").first()  # Keep the first occurrence of all columns
            )
            .sort(["t_name", "q_name"])
        )

        return stats_df

    def export_processed_file_aa(self, data_df, outfile):
        """
        Exports the processed hmmsearch output file for protein sequences.

        :param data_df: Polars DataFrame containing the parsed data.
        :type data_df: pl.DataFrame
        :param outfile: Path to the output file.
        :type outfile: str
        :return: None
        """
        # Select and rename columns for output. Protein input has no frame,
        # so the translated-name column is a literal "-" placeholder.
        output_df = data_df.select([
            pl.col('t_name').alias('Contig_name'),
            pl.lit("-").alias('Translated_contig_name (frame)'),
            pl.col('tlen').alias('Sequence_length(AA)'),
            pl.col('q_name').alias('Profile_name'),
            pl.col('qlen').alias('Profile_length'),
            pl.col('E-value'),
            pl.col('score'),
            pl.col('norm_bitscore_profile'),
            pl.col('norm_bitscore_contig'),
            pl.col('ID_score'),
            pl.col('RdRp_start').alias('RdRp_from(AA)'),
            pl.col('RdRp_end').alias('RdRp_to(AA)'),
            pl.col('profile_coverage'),
            pl.col('contig_coverage')
        ])

        output_df.write_csv(outfile, separator="\t")

    def export_processed_file_dna(self, data_df, outfile):
        """
        Exports the processed hmmsearch output file for DNA sequences.

        :param data_df: Polars DataFrame containing the parsed data.
        :type data_df: pl.DataFrame
        :param outfile: Path to the output file.
        :type outfile: str
        :return: None
        """
        # Recover the original contig name by stripping the "_frame=N" suffix
        # added during six-frame translation; keep the full translated name too.
        output_df = (data_df
                     .with_columns([
                         pl.col('t_name').str.extract(r'(.+)_frame=[-]?\d').alias('Contig_name'),
                         pl.col('t_name').alias('Translated_contig_name (frame)')
                     ])
                     .select([
                         pl.col('Contig_name'),
                         pl.col('Translated_contig_name (frame)'),
                         pl.col('tlen').alias('Sequence_length(AA)'),
                         pl.col('q_name').alias('Profile_name'),
                         pl.col('qlen').alias('Profile_length'),
                         pl.col('E-value'),
                         pl.col('score'),
                         pl.col('norm_bitscore_profile'),
                         pl.col('norm_bitscore_contig'),
                         pl.col('ID_score'),
                         pl.col('RdRp_start').alias('RdRp_from(AA)'),
                         pl.col('RdRp_end').alias('RdRp_to(AA)'),
                         pl.col('profile_coverage'),
                         pl.col('contig_coverage')
                     ]))
        output_df.write_csv(outfile, separator="\t")
|
|
264
|
+
|
|
265
|
+
class hmmsearch_format_helpers:
    """Small query helpers over a processed hmmsearch TSV file."""

    def __init__(self, hmm_outfn, seq_type, logger=None):
        """
        :param hmm_outfn: Path to a processed hmmsearch output TSV.
        :type hmm_outfn: str
        :param seq_type: Input sequence type, 'prot' or 'nuc'.
        :type seq_type: str
        :param logger: Optional logger with a silent_log() method.
        :type logger: utils.Logger
        """
        self.hmm_outfn = hmm_outfn
        self.seq_type = seq_type
        self.logger = logger

    def hmm_to_contig_set(self):
        """
        Returns a set of all contig names in the data.

        :return: Set of contig names.
        :rtype: set
        :raises ValueError: If seq_type is neither 'nuc' nor 'prot'.
        """
        df = pl.read_csv(self.hmm_outfn, separator='\t')
        # NOTE(review): the column choice per seq_type looks inverted — for
        # 'prot' input the 'Translated_contig_name (frame)' column holds only
        # "-" placeholders — but downstream callers may rely on the current
        # behavior, so it is preserved. Confirm against callers before swapping.
        if self.seq_type == 'nuc':
            result = set(df['Contig_name'].unique())
        elif self.seq_type == 'prot':
            result = set(df['Translated_contig_name (frame)'].unique())
        else:
            # Previously fell through to an unbound-variable NameError.
            raise ValueError(f"Unknown seq_type: {self.seq_type!r} (expected 'nuc' or 'prot')")
        if self.logger:
            self.logger.silent_log(f"Found {len(result)} unique contigs")
        return result

    def highest_bitscore_hits(self, filtered_file):
        """
        Filters the hmmsearch output file based on the highest bitscore for each contig.

        :param filtered_file: Path to the filtered output file.
        :type filtered_file: str
        :return: None
        """
        df = pl.read_csv(self.hmm_outfn, separator='\t')
        if self.logger:
            self.logger.silent_log(f"Processing {len(df)} hits for highest bitscore")

        # Get total hits per contig (pl.len() replaces deprecated pl.count(),
        # matching the rest of this module).
        hit_counts = df.group_by('Contig_name').agg(
            pl.len().alias('Total_positive_profiles')
        )

        # Get best hits by score
        best_hits = df.join(hit_counts, on='Contig_name').sort('score', descending=True).group_by('Contig_name').first()

        if self.logger:
            self.logger.silent_log(f"Found {len(best_hits)} best hits")

        best_hits.write_csv(filtered_file, separator='\t')

    def highest_norm_bit_prof_hits(self, filtered_file):
        """
        Filters the hmmsearch output file based on the highest normalized bitscore for each contig.

        :param filtered_file: Path to the filtered output file.
        :type filtered_file: str
        :return: None
        """
        df = pl.read_csv(self.hmm_outfn, separator='\t')
        if self.logger:
            self.logger.silent_log(f"Processing {len(df)} hits for highest normalized bitscore")

        # Get best hits by normalized bitscore
        best_hits = df.sort('norm_bitscore_profile', descending=True).group_by('Contig_name').first()

        if self.logger:
            self.logger.silent_log(f"Found {len(best_hits)} best hits")

        best_hits.write_csv(filtered_file, separator='\t')

    def lowest_evalue_hits(self, filtered_file):
        """
        Filters the hmmsearch output file based on the lowest E-value for each contig.

        :param filtered_file: Path to the filtered output file.
        :type filtered_file: str
        :return: None
        """
        df = pl.read_csv(self.hmm_outfn, separator='\t')
        if self.logger:
            self.logger.silent_log(f"Processing {len(df)} hits for lowest E-value")

        # Get best hits by lowest E-value (ascending sort, then first per contig)
        best_hits = df.sort('E-value').group_by('Contig_name').first()

        if self.logger:
            self.logger.silent_log(f"Found {len(best_hits)} best hits")

        best_hits.write_csv(filtered_file, separator='\t')

    def extract_col(self, index):
        """
        Extracts a column from the hmmsearch output file based on index.

        :param index: Index of the column to extract.
        :type index: int
        :return: List of values from the specified column.
        :rtype: list
        """
        df = pl.read_csv(self.hmm_outfn, separator='\t')
        return df.select(df.columns[index]).to_series().to_list()
|
|
364
|
+
|
|
365
|
+
class hmmsearch_output_writter:
    """Writes combined RdRpCATCH results as TSV and GFF3."""

    def __init__(self, logger=None):
        """
        Constructor for the hmmsearch_output_writter class.

        :param logger: Logger instance for output
        :type logger: utils.Logger
        """
        self.logger = logger

    def write_hmmsearch_hits(self, hmmsearch_out_file, seq_type, rdrpcatch_out, gff_out):
        """
        Writes the best hit per contig to a TSV file and a GFF file.

        :param hmmsearch_out_file: Path to the hmmsearch output file.
        :type hmmsearch_out_file: str
        :param seq_type: Type of sequence (prot or nuc).
        :type seq_type: str
        :param rdrpcatch_out: Path to the RdRpCATCH output file.
        :type rdrpcatch_out: str
        :param gff_out: Path to the GFF output file.
        :type gff_out: str
        :return: None
        """
        from .utils import write_combined_results_to_gff, convert_record_to_gff3_record

        df = pl.read_csv(hmmsearch_out_file, separator='\t')

        # Per contig, summarize which databases detected it and with how many
        # profiles, as "db=count" pairs joined by ';'.
        grouped = df.group_by("Contig_name").agg(
            pl.concat_str(
                [
                    pl.col("db_name"),
                    pl.col("Total_positive_profiles").cast(str)
                ],
                separator="="
            ).str.join(";").alias("Total_databases_that_the_contig_was_detected(No_of_Profiles)")
        )
        # Group by contig name and get the max score
        max_scores = df.group_by("Contig_name").agg(pl.max("score"))
        # Join the max scores (keeps only each contig's best-scoring rows)
        # and attach the per-contig database summary.
        result_df = df.join(max_scores, on=["Contig_name", "score"]).join(grouped, on="Contig_name")
        # Drop the Total_positive_profiles column; unique() breaks score ties
        result_df = result_df.unique("Contig_name").drop("Total_positive_profiles")

        # Rename the columns (numeric columns are formatted to 3 decimals)
        result_df = result_df.with_columns(pl.col("db_name").alias("Best_hit_Database"))
        result_df = result_df.with_columns(pl.col("Profile_name").alias("Best_hit_profile_name"))
        result_df = result_df.with_columns(pl.col("Profile_length").alias("Best_hit_profile_length"))
        result_df = result_df.with_columns(pl.col("E-value").alias("Best_hit_e-value"))
        result_df = result_df.with_columns(pl.col("score").map_elements(lambda x: f"{x:.3f}", return_dtype=pl.Utf8).alias("Best_hit_bitscore"))
        result_df = result_df.with_columns(pl.col("profile_coverage").map_elements(lambda x: f"{x:.3f}", return_dtype=pl.Utf8).alias("Best_hit_profile_coverage"))
        result_df = result_df.with_columns(pl.col("contig_coverage").map_elements(lambda x: f"{x:.3f}", return_dtype=pl.Utf8).alias("Best_hit_contig_coverage"))
        result_df = result_df.with_columns(pl.col("norm_bitscore_profile").map_elements(lambda x: f"{x:.3f}", return_dtype=pl.Utf8).alias("Best_hit_norm_bitscore_profile"))
        result_df = result_df.with_columns(pl.col("norm_bitscore_contig").map_elements(lambda x: f"{x:.3f}", return_dtype=pl.Utf8).alias("Best_hit_norm_bitscore_contig"))
        result_df = result_df.with_columns(pl.col("ID_score").map_elements(lambda x: f"{x:.3f}", return_dtype=pl.Utf8).alias("Best_hit_ID_score"))

        # Reorder the columns
        column_order = ["Contig_name", "Translated_contig_name (frame)",
                        "Sequence_length(AA)", "Total_databases_that_the_contig_was_detected(No_of_Profiles)",
                        "Best_hit_Database", "Best_hit_profile_name", "Best_hit_profile_length", "Best_hit_e-value",
                        "Best_hit_bitscore", "RdRp_from(AA)", "RdRp_to(AA)", "Best_hit_profile_coverage",
                        "Best_hit_contig_coverage", "Best_hit_norm_bitscore_profile", "Best_hit_norm_bitscore_contig",
                        "Best_hit_ID_score"]

        result_df = result_df.select(column_order)

        # Write the RdRpCATCH output file first
        result_df.write_csv(rdrpcatch_out, separator='\t')

        # Write the same records in GFF3 format
        write_combined_results_to_gff(gff_out, result_df, seq_type)

    def get_rdrp_coords(self, rdrpcatch_out, seq_type):
        """
        Gets the RdRp coordinates from the RdRpCATCH output file.

        :param rdrpcatch_out: Path to the RdRpCATCH output file.
        :type rdrpcatch_out: str
        :param seq_type: Type of sequence (prot or nuc).
        :type seq_type: str
        :return: List of tuples containing contig name and RdRp coordinates.
        :rtype: list
        :raises ValueError: If seq_type is neither 'nuc' nor 'prot'.
        """
        # The original code computed Path(x).parent / Path(x): a no-op for
        # absolute paths and a wrong duplication of the parent directory for
        # relative ones. Use the given path directly.
        combined_file = str(rdrpcatch_out)
        if self.logger:
            self.logger.silent_log(f"Reading coordinates from {combined_file}")

        df = pl.read_csv(combined_file, separator='\t')
        if self.logger:
            self.logger.silent_log(f"Found {len(df)} rows in combined file")
            self.logger.silent_log(f"Column names: {df.columns}")
        # For nucleotide input, coordinates refer to the translated (framed)
        # sequence name; for protein input, to the contig name itself.
        if seq_type == 'nuc':
            coords = df.select([
                'Translated_contig_name (frame)',
                'RdRp_from(AA)',
                'RdRp_to(AA)'
            ]).rows()
        elif seq_type == 'prot':
            coords = df.select([
                'Contig_name',
                'RdRp_from(AA)',
                'RdRp_to(AA)'
            ]).rows()
        else:
            # Previously fell through to an unbound-variable NameError.
            raise ValueError(f"Unknown seq_type: {seq_type!r} (expected 'nuc' or 'prot')")

        if self.logger:
            self.logger.silent_log(f"Extracted {len(coords)} coordinate sets")
            self.logger.silent_log(f"First few coordinates: {coords[:3]}")
        return coords
|
|
519
|
+
|
|
520
|
+
class hmmsearch_combiner:
    """Concatenates per-database hmmsearch result files into one TSV."""

    def __init__(self, hmmsearch_files, combined_file, logger=None):
        """
        Constructor for the hmmsearch_combiner class.

        :param hmmsearch_files: List of paths to the hmmsearch output files.
        :type hmmsearch_files: list
        :param combined_file: Path to the combined output file.
        :type combined_file: str
        :param logger: Logger instance for output
        :type logger: utils.Logger
        """
        self.hmmsearch_files = hmmsearch_files
        self.combined_file = combined_file
        self.logger = logger
        # Combining happens eagerly at construction time.
        self.combine_files(self.hmmsearch_files, self.combined_file)

    def combine_files(self, hmmsearch_files, combined_file):
        """
        Combines multiple hmmsearch output files into a single file.

        Each input gets a 'db_name' column (derived from its filename) and a
        'Total_positive_profiles' per-contig hit count before concatenation.

        :param hmmsearch_files: List of paths to the hmmsearch output files.
        :type hmmsearch_files: list
        :param combined_file: Path to the combined output file.
        :type combined_file: str
        :return: Path to the combined output file.
        :rtype: str
        """
        # Read and process each file
        processed_dfs = []
        if self.logger:
            self.logger.silent_log(f"Processing {len(hmmsearch_files)} hmmsearch output files")

        for f in hmmsearch_files:
            if self.logger:
                self.logger.silent_log(f"Processing file: {f}")

            df = pl.read_csv(f, separator='\t')
            # Extract database name from filename
            # (e.g. "<prefix>_<db>_hmm_output.tsv" -> "<db>")
            db_name = Path(f).stem.split('_hmm_output')[0].split('_')[-1]

            # Add database name
            df = df.with_columns([
                pl.lit(db_name).alias('db_name')
            ])

            # Get total hits per contig.
            # Fixed: was df.groupby(...) with pl.count(), which is removed /
            # deprecated in modern polars; group_by + pl.len() matches the
            # rest of this module.
            hit_counts = df.group_by('Contig_name').agg(
                pl.len().alias('Total_positive_profiles')
            )
            df = df.join(hit_counts, on='Contig_name')

            if self.logger:
                self.logger.silent_log(f"Found {len(df)} hits for database {db_name}")

            processed_dfs.append(df)

        # Combine all processed DataFrames
        combined_df = pl.concat(processed_dfs)
        if self.logger:
            self.logger.silent_log(f"Combined {len(processed_dfs)} dataframes with total {len(combined_df)} rows")

        # Write combined DataFrame to file
        combined_df.write_csv(combined_file, separator='\t')
        if self.logger:
            self.logger.silent_log(f"Written combined results to: {combined_file}")

        return combined_file
|
|
589
|
+
|