rdrpcatch 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,589 @@
1
+ import polars as pl
2
+ import re
3
+ from pathlib import Path
4
+
5
+
6
+
7
def calculate_true_coverage(starts: list, ends: list) -> int:
    """Compute total covered length after merging overlapping intervals.

    Intervals that overlap or are directly adjacent (gap of 0 between the
    end of one and the start of the next) are collapsed into a single
    span; the result is the summed length of the collapsed spans, with
    both endpoints counted (inclusive coordinates).

    :param starts: List of start positions
    :type starts: list
    :param ends: List of end positions (parallel to ``starts``)
    :type ends: list
    :return: Total coverage
    :rtype: int
    """
    if not starts:
        return 0

    spans = sorted(zip(starts, ends))
    total = 0
    lo, hi = spans[0]

    for nxt_lo, nxt_hi in spans[1:]:
        if nxt_lo > hi + 1:
            # Gap before the next span: close out the current run.
            total += hi - lo + 1
            lo, hi = nxt_lo, nxt_hi
        elif nxt_hi > hi:
            # Overlapping or adjacent: extend the current run.
            hi = nxt_hi

    total += hi - lo + 1
    return total
34
+
35
+
36
class hmmsearch_formatter:
    """
    Parses and post-processes hmmsearch output files.

    Reads the ``.custom.tsv`` companion of the raw hmmsearch output, adds
    normalized bitscores and merged-interval coverage statistics, and writes
    a processed TSV whose first two columns depend on the input sequence
    type.

    Attributes:
        hmm_output_file: Path to the raw hmmsearch output file.

    Methods:
        calculate_norm_bitscore_profile(data_df): score normalized by profile length.
        calculate_norm_bitscore_contig(data_df): score normalized by contig length.
        calculate_coverage_stats(data_df): per (contig, profile) coverage stats.
        export_processed_file_aa(data_df, outfile): write output for protein input.
        export_processed_file_dna(data_df, outfile): write output for DNA input.
    """

    # Output header, also used for the empty (no-hits) placeholder file.
    # NOTE: kept in sync with the exporters' select() lists below.
    OUTPUT_COLUMNS = ['Contig_name', 'Translated_contig_name (frame)', 'Sequence_length(AA)',
                      'Profile_name', 'Profile_length', 'E-value', 'score',
                      'norm_bitscore_profile', 'norm_bitscore_contig', 'ID_score',
                      'RdRp_from(AA)', 'RdRp_to(AA)', 'profile_coverage', 'contig_coverage']

    def __init__(self, hmm_raw, hmm_processed, seq_type):
        """
        Constructor for the hmmsearch_formatter class.

        :param hmm_raw: Path to the raw hmmsearch output file (must support
            ``with_suffix``, i.e. a pathlib.Path).
        :type hmm_raw: pathlib.Path
        :param hmm_processed: Path to the processed output file.
        :type hmm_processed: str
        :param seq_type: ``'prot'`` for protein input, ``'nuc'`` for DNA input.
        :type seq_type: str

        If PROTEIN: contig name is the first column.
        If DNA: contig name is the last column; the first column is the
        translated sequence name (e.g. contig_name_frame).
        """
        self.hmm_output_file = hmm_raw
        hmm_custom = str(hmm_raw.with_suffix('.custom.tsv'))

        data_df = pl.read_csv(hmm_custom, separator='\t')
        if data_df.is_empty():
            # No hits: emit a header-only file and stop.
            # BUG FIX: the original fell through to the exporters, which
            # select raw columns ('t_name', 'qlen', ...) that this
            # placeholder frame does not have and would raise.
            # BUG FIX: the original placeholder also included an extra
            # 'acc' column that the exporters never emit, so empty and
            # non-empty outputs had different schemas.
            empty_df = pl.DataFrame({col: [] for col in self.OUTPUT_COLUMNS})
            empty_df.write_csv(hmm_processed, separator="\t")
            return

        data_df = self.calculate_norm_bitscore_profile(data_df)
        data_df = self.calculate_norm_bitscore_contig(data_df)
        data_df = self.calculate_coverage_stats(data_df)

        if seq_type == 'prot':
            self.export_processed_file_aa(data_df, hmm_processed)
        elif seq_type == 'nuc':
            self.export_processed_file_dna(data_df, hmm_processed)

    def calculate_norm_bitscore_profile(self, data_df):
        """
        Adds ``norm_bitscore_profile`` = score / qlen (bitscore per profile column).

        :param data_df: Parsed hmmsearch hits.
        :type data_df: pl.DataFrame
        :return: Input frame with the new column appended.
        :rtype: pl.DataFrame
        """
        return data_df.with_columns(
            (pl.col('score') / pl.col('qlen')).alias('norm_bitscore_profile')
        )

    def calculate_norm_bitscore_contig(self, data_df):
        """
        Adds ``norm_bitscore_contig`` = score / tlen (bitscore per contig residue).

        :param data_df: Parsed hmmsearch hits.
        :type data_df: pl.DataFrame
        :return: Input frame with the new column appended.
        :rtype: pl.DataFrame
        """
        return data_df.with_columns(
            (pl.col('score') / pl.col('tlen')).alias('norm_bitscore_contig')
        )

    def calculate_coverage_stats(self, data_df):
        """
        Adds per (contig, profile) coverage statistics.

        For every (t_name, q_name) pair, the envelope / hmm / alignment
        intervals of all domain hits are merged (overlaps and adjacency
        collapsed via calculate_true_coverage) to obtain true covered
        lengths, which are then normalized by contig / profile length.
        Also records the overall RdRp span (min env_from .. max env_to)
        and an ID_score (score per aligned residue).

        :param data_df: Parsed hmmsearch hits, one row per domain hit.
        :type data_df: pl.DataFrame
        :return: One row per (t_name, q_name) with coverage columns attached.
        :rtype: pl.DataFrame
        """
        # Coordinates arrive as strings/mixed; force integer dtype before math.
        df = data_df.with_columns(
            pl.col("env_from").cast(pl.Int64),
            pl.col("env_to").cast(pl.Int64),
            pl.col("hmm_from").cast(pl.Int64),
            pl.col("hmm_to").cast(pl.Int64),
            pl.col("ali_from").cast(pl.Int64),
            pl.col("ali_to").cast(pl.Int64)
        )

        stats_df = (
            df
            .with_row_index("row_id")  # preserve original row order across the join
            .join(
                df.group_by(["t_name", "q_name"])
                .agg(
                    pl.col("env_from").alias("starts"),
                    pl.col("env_to").alias("ends"),
                    pl.col("hmm_from").alias("hmm_starts"),
                    pl.col("hmm_to").alias("hmm_ends"),
                    pl.col("ali_from").alias("ali_starts"),
                    pl.col("ali_to").alias("ali_ends"),
                    pl.col("tlen").first().alias("tlen"),
                    pl.col("qlen").first().alias("qlen"),
                    pl.col("score").first().alias("score"),
                    pl.col("env_from").min().alias("RdRp_start"),
                    pl.col("env_to").max().alias("RdRp_end"),
                    pl.len().alias("row_count")
                )
                .with_columns(
                    # Single-hit groups take the cheap arithmetic path and
                    # skip the Python-level interval merge.
                    contig_coverage=pl.when(pl.col("row_count") == 1)
                    .then(pl.col("ends").list.first() - pl.col("starts").list.first() + 1)
                    .otherwise(
                        pl.struct(["starts", "ends"])
                        .map_elements(lambda x: calculate_true_coverage(x["starts"], x["ends"]),
                                      return_dtype=pl.Int64)
                    ),
                    profile_coverage=pl.when(pl.col("row_count") == 1)
                    .then(pl.col("hmm_ends").list.first() - pl.col("hmm_starts").list.first() + 1)
                    .otherwise(
                        pl.struct(["hmm_starts", "hmm_ends"])
                        .map_elements(lambda x: calculate_true_coverage(x["hmm_starts"], x["hmm_ends"]),
                                      return_dtype=pl.Int64)
                    ),
                    aligned_coverage=pl.when(pl.col("row_count") == 1)
                    .then(pl.col("ali_ends").list.first() - pl.col("ali_starts").list.first() + 1)
                    .otherwise(
                        pl.struct(["ali_starts", "ali_ends"])
                        .map_elements(lambda x: calculate_true_coverage(x["ali_starts"], x["ali_ends"]),
                                      return_dtype=pl.Int64)
                    )
                )
                .with_columns(
                    # Normalize absolute coverages to fractions.
                    contig_coverage=pl.col("contig_coverage") / pl.col("tlen"),
                    profile_coverage=pl.col("profile_coverage") / pl.col("qlen"),
                    ID_score=pl.col("score") / pl.col("aligned_coverage")
                )
                .select(
                    ["t_name", "q_name", "contig_coverage", "profile_coverage",
                     "ID_score", "RdRp_start", "RdRp_end"]),
                on=["t_name", "q_name"]
            )
            .sort("row_id")
            .drop("row_id")
        )
        # Keep one representative row per (contig, profile) pair.
        stats_df = (
            stats_df
            .group_by(["t_name", "q_name"])
            .agg(
                pl.col("*").first()
            )
            .sort(["t_name", "q_name"])
        )

        return stats_df

    def export_processed_file_aa(self, data_df, outfile):
        """
        Exports the processed hmmsearch output file for protein sequences.

        The translated-name column is filled with ``"-"`` since protein
        input has no reading frame.

        :param data_df: Polars DataFrame containing the parsed data.
        :type data_df: pl.DataFrame
        :param outfile: Path to the output file.
        :type outfile: str
        :return: None
        """
        output_df = data_df.select([
            pl.col('t_name').alias('Contig_name'),
            pl.lit("-").alias('Translated_contig_name (frame)'),
            pl.col('tlen').alias('Sequence_length(AA)'),
            pl.col('q_name').alias('Profile_name'),
            pl.col('qlen').alias('Profile_length'),
            pl.col('E-value'),
            pl.col('score'),
            pl.col('norm_bitscore_profile'),
            pl.col('norm_bitscore_contig'),
            pl.col('ID_score'),
            pl.col('RdRp_start').alias('RdRp_from(AA)'),
            pl.col('RdRp_end').alias('RdRp_to(AA)'),
            pl.col('profile_coverage'),
            pl.col('contig_coverage')
        ])

        output_df.write_csv(outfile, separator="\t")

    def export_processed_file_dna(self, data_df, outfile):
        """
        Exports the processed hmmsearch output file for DNA sequences.

        The original contig name is recovered by stripping the
        ``_frame=<n>`` suffix that translation appended to the sequence name.

        :param data_df: Polars DataFrame containing the parsed data.
        :type data_df: pl.DataFrame
        :param outfile: Path to the output file.
        :type outfile: str
        :return: None
        """
        output_df = (data_df
                     .with_columns([
                         pl.col('t_name').str.extract(r'(.+)_frame=[-]?\d').alias('Contig_name'),
                         pl.col('t_name').alias('Translated_contig_name (frame)')
                     ])
                     .select([
                         pl.col('Contig_name'),
                         pl.col('Translated_contig_name (frame)'),
                         pl.col('tlen').alias('Sequence_length(AA)'),
                         pl.col('q_name').alias('Profile_name'),
                         pl.col('qlen').alias('Profile_length'),
                         pl.col('E-value'),
                         pl.col('score'),
                         pl.col('norm_bitscore_profile'),
                         pl.col('norm_bitscore_contig'),
                         pl.col('ID_score'),
                         pl.col('RdRp_start').alias('RdRp_from(AA)'),
                         pl.col('RdRp_end').alias('RdRp_to(AA)'),
                         pl.col('profile_coverage'),
                         pl.col('contig_coverage')
                     ]))
        output_df.write_csv(outfile, separator="\t")
264
+
265
class hmmsearch_format_helpers:
    """Query and filter helpers over a processed hmmsearch TSV file."""

    def __init__(self, hmm_outfn, seq_type, logger=None):
        """
        :param hmm_outfn: Path to the processed hmmsearch TSV.
        :type hmm_outfn: str
        :param seq_type: Sequence type, ``'nuc'`` or ``'prot'``.
        :type seq_type: str
        :param logger: Optional logger exposing ``silent_log``.
        """
        self.hmm_outfn = hmm_outfn
        self.seq_type = seq_type
        self.logger = logger

    def hmm_to_contig_set(self):
        """
        Returns a set of all contig names in the data.

        :return: Set of contig names.
        :rtype: set
        :raises ValueError: If ``seq_type`` is neither 'nuc' nor 'prot'.
        """
        df = pl.read_csv(self.hmm_outfn, separator='\t')
        if self.seq_type == 'nuc':
            name_col = 'Contig_name'
        elif self.seq_type == 'prot':
            # NOTE(review): for 'prot' input the formatter fills this column
            # with "-" — confirm the nuc/prot column mapping is intended.
            name_col = 'Translated_contig_name (frame)'
        else:
            # BUG FIX: the original fell through with `result` unbound,
            # raising NameError instead of a meaningful error.
            raise ValueError(f"Unknown seq_type: {self.seq_type!r}")
        result = set(df[name_col].unique())
        if self.logger:
            self.logger.silent_log(f"Found {len(result)} unique contigs")
        return result

    def highest_bitscore_hits(self, filtered_file):
        """
        Filters the hmmsearch output file based on the highest bitscore for each contig.

        :param filtered_file: Path to the filtered output file.
        :type filtered_file: str
        :return: None
        """
        df = pl.read_csv(self.hmm_outfn, separator='\t')
        if self.logger:
            self.logger.silent_log(f"Processing {len(df)} hits for highest bitscore")

        # Total hits per contig (pl.len replaces the removed pl.count,
        # matching usage elsewhere in this module).
        hit_counts = df.group_by('Contig_name').agg(
            pl.len().alias('Total_positive_profiles')
        )

        # Best hit per contig by raw bitscore.
        best_hits = (df.join(hit_counts, on='Contig_name')
                     .sort('score', descending=True)
                     .group_by('Contig_name')
                     .first())

        if self.logger:
            self.logger.silent_log(f"Found {len(best_hits)} best hits")

        best_hits.write_csv(filtered_file, separator='\t')

    def highest_norm_bit_prof_hits(self, filtered_file):
        """
        Filters the hmmsearch output file based on the highest normalized bitscore for each contig.

        :param filtered_file: Path to the filtered output file.
        :type filtered_file: str
        :return: None
        """
        df = pl.read_csv(self.hmm_outfn, separator='\t')
        if self.logger:
            self.logger.silent_log(f"Processing {len(df)} hits for highest normalized bitscore")

        best_hits = (df.sort('norm_bitscore_profile', descending=True)
                     .group_by('Contig_name')
                     .first())

        if self.logger:
            self.logger.silent_log(f"Found {len(best_hits)} best hits")

        best_hits.write_csv(filtered_file, separator='\t')

    def lowest_evalue_hits(self, filtered_file):
        """
        Filters the hmmsearch output file based on the lowest E-value for each contig.

        :param filtered_file: Path to the filtered output file.
        :type filtered_file: str
        :return: None
        """
        df = pl.read_csv(self.hmm_outfn, separator='\t')
        if self.logger:
            self.logger.silent_log(f"Processing {len(df)} hits for lowest E-value")

        # Ascending sort puts the smallest (best) E-value first.
        best_hits = df.sort('E-value').group_by('Contig_name').first()

        if self.logger:
            self.logger.silent_log(f"Found {len(best_hits)} best hits")

        best_hits.write_csv(filtered_file, separator='\t')

    def extract_col(self, index):
        """
        Extracts a column from the hmmsearch output file based on index.

        :param index: Index of the column to extract.
        :type index: int
        :return: List of values from the specified column.
        :rtype: list
        """
        df = pl.read_csv(self.hmm_outfn, separator='\t')
        return df.select(df.columns[index]).to_series().to_list()
364
+
365
class hmmsearch_output_writter:
    """Writes combined hmmsearch results as RdRpCATCH TSV and GFF output."""

    def __init__(self, logger=None):
        """
        Constructor for the hmmsearch_output_writter class.

        :param logger: Logger instance for output
        :type logger: utils.Logger
        """
        self.logger = logger

    def write_hmmsearch_hits(self, hmmsearch_out_file, seq_type, rdrpcatch_out, gff_out):
        """
        Writes the best hmmsearch hit per contig to a TSV and a GFF file.

        :param hmmsearch_out_file: Path to the hmmsearch output file.
        :type hmmsearch_out_file: str
        :param seq_type: Type of sequence (prot or nuc).
        :type seq_type: str
        :param rdrpcatch_out: Path to the RdRpCATCH output file.
        :type rdrpcatch_out: str
        :param gff_out: Path to the GFF output file.
        :type gff_out: str
        :return: None
        """
        from .utils import write_combined_results_to_gff

        df = pl.read_csv(hmmsearch_out_file, separator='\t')

        # Per contig, summarize every database and its hit count as
        # "db1=3;db2=1".
        grouped = df.group_by("Contig_name").agg(
            pl.concat_str(
                [
                    pl.col("db_name"),
                    pl.col("Total_positive_profiles").cast(str)
                ],
                separator="="
            ).str.join(";").alias("Total_databases_that_the_contig_was_detected(No_of_Profiles)")
        )
        # Best hit per contig = the row carrying the contig's maximum score.
        max_scores = df.group_by("Contig_name").agg(pl.max("score"))
        result_df = df.join(max_scores, on=["Contig_name", "score"]).join(grouped, on="Contig_name")
        # Ties on the max score would duplicate a contig; keep one row each.
        result_df = result_df.unique("Contig_name").drop("Total_positive_profiles")

        # Rename best-hit columns.
        result_df = result_df.with_columns(
            pl.col("db_name").alias("Best_hit_Database"),
            pl.col("Profile_name").alias("Best_hit_profile_name"),
            pl.col("Profile_length").alias("Best_hit_profile_length"),
            pl.col("E-value").alias("Best_hit_e-value")
        )
        # Format numeric best-hit columns to 3 decimal places.
        for src, dst in [
            ("score", "Best_hit_bitscore"),
            ("profile_coverage", "Best_hit_profile_coverage"),
            ("contig_coverage", "Best_hit_contig_coverage"),
            ("norm_bitscore_profile", "Best_hit_norm_bitscore_profile"),
            ("norm_bitscore_contig", "Best_hit_norm_bitscore_contig"),
            ("ID_score", "Best_hit_ID_score"),
        ]:
            result_df = result_df.with_columns(
                pl.col(src).map_elements(lambda x: f"{x:.3f}", return_dtype=pl.Utf8).alias(dst)
            )

        # Final column order of the RdRpCATCH report.
        column_order = ["Contig_name", "Translated_contig_name (frame)",
                        "Sequence_length(AA)", "Total_databases_that_the_contig_was_detected(No_of_Profiles)",
                        "Best_hit_Database", "Best_hit_profile_name", "Best_hit_profile_length", "Best_hit_e-value",
                        "Best_hit_bitscore", "RdRp_from(AA)", "RdRp_to(AA)", "Best_hit_profile_coverage",
                        "Best_hit_contig_coverage", "Best_hit_norm_bitscore_profile", "Best_hit_norm_bitscore_contig",
                        "Best_hit_ID_score"]

        result_df = result_df.select(column_order)

        # Write the RdRpCATCH output file first, then the GFF view of it.
        result_df.write_csv(rdrpcatch_out, separator='\t')
        write_combined_results_to_gff(gff_out, result_df, seq_type)

    def get_rdrp_coords(self, rdrpcatch_out, seq_type):
        """
        Gets the RdRp coordinates from the RdRpCATCH output file.

        :param rdrpcatch_out: Path to the RdRpCATCH output file.
        :type rdrpcatch_out: str
        :param seq_type: Type of sequence (prot or nuc).
        :type seq_type: str
        :return: List of tuples (sequence name, RdRp start, RdRp end).
        :rtype: list
        :raises ValueError: If ``seq_type`` is neither 'nuc' nor 'prot'.
        """
        # BUG FIX: the original built Path(out).parent / Path(out), which
        # duplicates the directory for relative paths ("dir/dir/file.tsv")
        # and is a no-op for absolute ones; read the given file directly.
        combined_file = str(rdrpcatch_out)
        if self.logger:
            self.logger.silent_log(f"Reading coordinates from {combined_file}")

        df = pl.read_csv(combined_file, separator='\t')
        if self.logger:
            self.logger.silent_log(f"Found {len(df)} rows in combined file")
            self.logger.silent_log(f"Column names: {df.columns}")

        # DNA input carries coordinates on the translated (framed) sequence;
        # protein input on the contig itself.
        if seq_type == 'nuc':
            name_col = 'Translated_contig_name (frame)'
        elif seq_type == 'prot':
            name_col = 'Contig_name'
        else:
            # BUG FIX: the original left `coords` unbound here (NameError).
            raise ValueError(f"Unknown seq_type: {seq_type!r}")
        coords = df.select([
            name_col,
            'RdRp_from(AA)',
            'RdRp_to(AA)'
        ]).rows()

        if self.logger:
            self.logger.silent_log(f"Extracted {len(coords)} coordinate sets")
            self.logger.silent_log(f"First few coordinates: {coords[:3]}")
        return coords
519
+
520
class hmmsearch_combiner:
    """Combines per-database hmmsearch result TSVs into a single table."""

    def __init__(self, hmmsearch_files, combined_file, logger=None):
        """
        Constructor for the hmmsearch_combiner class.

        Combining runs immediately on construction.

        :param hmmsearch_files: List of paths to the hmmsearch output files.
        :type hmmsearch_files: list
        :param combined_file: Path to the combined output file.
        :type combined_file: str
        :param logger: Logger instance for output
        :type logger: utils.Logger
        """
        self.hmmsearch_files = hmmsearch_files
        self.combined_file = combined_file
        self.logger = logger
        self.combine_files(self.hmmsearch_files, self.combined_file)

    def combine_files(self, hmmsearch_files, combined_file):
        """
        Combines multiple hmmsearch output files into a single file.

        Each input frame is annotated with its database name (derived from
        the filename) and a per-contig hit count before concatenation.

        :param hmmsearch_files: List of paths to the hmmsearch output files.
        :type hmmsearch_files: list
        :param combined_file: Path to the combined output file.
        :type combined_file: str
        :return: Path to the combined output file.
        :rtype: str
        """
        processed_dfs = []
        if self.logger:
            self.logger.silent_log(f"Processing {len(hmmsearch_files)} hmmsearch output files")

        for f in hmmsearch_files:
            if self.logger:
                self.logger.silent_log(f"Processing file: {f}")

            df = pl.read_csv(f, separator='\t')
            # Database name: last '_'-separated token before '_hmm_output'
            # in the file stem.
            db_name = Path(f).stem.split('_hmm_output')[0].split('_')[-1]

            df = df.with_columns([
                pl.lit(db_name).alias('db_name')
            ])

            # Total hits per contig. BUG FIX: DataFrame.groupby and
            # pl.count() were removed from the modern polars API (and the
            # rest of this module already uses group_by / pl.len).
            hit_counts = df.group_by('Contig_name').agg(
                pl.len().alias('Total_positive_profiles')
            )
            df = df.join(hit_counts, on='Contig_name')

            if self.logger:
                self.logger.silent_log(f"Found {len(df)} hits for database {db_name}")

            processed_dfs.append(df)

        # Stack all per-database frames into one table.
        combined_df = pl.concat(processed_dfs)
        if self.logger:
            self.logger.silent_log(f"Combined {len(processed_dfs)} dataframes with total {len(combined_df)} rows")

        combined_df.write_csv(combined_file, separator='\t')
        if self.logger:
            self.logger.silent_log(f"Written combined results to: {combined_file}")

        return combined_file
589
+