mgnify-pipelines-toolkit 1.0.4__py3-none-any.whl → 1.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


--- a/mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py
+++ b/mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py
@@ -17,8 +17,19 @@
 
 import re
 import sys
+import fileinput
 
-from mgnify_pipelines_toolkit.constants.thresholds import EVALUE_CUTOFF_IPS, EVALUE_CUTOFF_EGGNOG
+from mgnify_pipelines_toolkit.constants.thresholds import (
+    EVALUE_CUTOFF_IPS,
+    EVALUE_CUTOFF_EGGNOG,
+)
+
+DBCAN_CLASSES_DICT = {
+    "TC": "dbcan_transporter_classification",
+    "TF": "dbcan_transcription_factor",
+    "STP": "dbcan_signal_transduction_prot",
+    "CAZyme": "dbcan_prot_family",
+}
 
 
 def get_iprs(ipr_annot):
@@ -26,7 +37,8 @@ def get_iprs(ipr_annot):
     antifams = list()
     if not ipr_annot:
         return iprs, antifams
-    with open(ipr_annot) as f:
+    with fileinput.hook_compressed(ipr_annot, "r", encoding="utf-8") as f:
+
        for line in f:
            cols = line.strip().split("\t")
            protein = cols[0]
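The recurring change in this file replaces plain `open()` calls with `fileinput.hook_compressed()`, which picks a reader based on the file extension, so the same parsing code now accepts plain, gzip- and bzip2-compressed inputs. A minimal sketch of the pattern (the `encoding` keyword requires Python 3.10+; the filename is hypothetical):

```python
import fileinput

# hook_compressed dispatches on the extension: .gz -> gzip.open,
# .bz2 -> bz2.open, anything else -> the built-in open().
with fileinput.hook_compressed("annotations.tsv.gz", "r", encoding="utf-8") as f:
    for line in f:
        cols = line.rstrip("\n").split("\t")
```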
@@ -55,7 +67,8 @@ def get_eggnog(eggnog_annot):
     eggnogs = {}
     if not eggnog_annot:
         return eggnogs
-    with open(eggnog_annot, "r") as f:
+    with fileinput.hook_compressed(eggnog_annot, "r", encoding="utf-8") as f:
+
        for line in f:
            line = line.rstrip()
            cols = line.split("\t")
@@ -104,7 +117,8 @@ def get_bgcs(bgc_file, prokka_gff, tool):
         return bgc_annotations
     # save positions of each BGC cluster to dictionary cluster_positions
     # and save the annotations to dictionary bgc_result
-    with open(bgc_file, "r") as bgc_in:
+    with fileinput.hook_compressed(bgc_file, "r", encoding="utf-8") as bgc_in:
+
        for line in bgc_in:
            if not line.startswith("#"):
                (
@@ -138,7 +152,7 @@ def get_bgcs(bgc_file, prokka_gff, tool):
                 type_value = ""
                 as_product = ""
                 for a in annotations.split(
-                        ";"
+                    ";"
                 ):  # go through all parts of the annotation field
                     if a.startswith("as_type="):
                         type_value = a.split("=")[1]
@@ -170,9 +184,12 @@ def get_bgcs(bgc_file, prokka_gff, tool):
                     {"bgc_function": type_value},
                 )
                 if as_product:
-                    tool_result[contig]["_".join([start_pos, end_pos])]["bgc_product"] = as_product
+                    tool_result[contig]["_".join([start_pos, end_pos])][
+                        "bgc_product"
+                    ] = as_product
     # identify CDSs that fall into each of the clusters annotated by the BGC tool
-    with open(prokka_gff, "r") as gff_in:
+    with fileinput.hook_compressed(prokka_gff, "r", encoding="utf-8") as gff_in:
+
        for line in gff_in:
            if not line.startswith("#"):
                matching_interval = ""
@@ -228,8 +245,9 @@ def get_bgcs(bgc_file, prokka_gff, tool):
                         },
                     )
                     if "bgc_product" in tool_result[contig][matching_interval]:
-                        bgc_annotations[cds_id]["antismash_product"] = tool_result[contig][matching_interval][
-                            "bgc_product"]
+                        bgc_annotations[cds_id]["antismash_product"] = tool_result[
+                            contig
+                        ][matching_interval]["bgc_product"]
            elif line.startswith("##FASTA"):
                break
    return bgc_annotations
@@ -239,7 +257,7 @@ def get_amr(amr_file):
     amr_annotations = {}
     if not amr_file:
         return amr_annotations
-    with open(amr_file, "r") as f:
+    with fileinput.hook_compressed(amr_file, "r", encoding="utf-8") as f:
        for line in f:
            if line.startswith("Protein identifier"):
                continue
@@ -286,7 +304,7 @@ def get_dbcan(dbcan_file):
     substrates = dict()
     if not dbcan_file:
         return dbcan_annotations
-    with open(dbcan_file, "r") as f:
+    with fileinput.hook_compressed(dbcan_file, "r", encoding="utf-8") as f:
        for line in f:
            if "predicted PUL" in line:
                annot_fields = line.strip().split("\t")[8].split(";")
@@ -314,13 +332,45 @@ def get_dbcan(dbcan_file):
                 elif a.startswith("Parent"):
                     parent = a.split("=")[1]
             dbcan_annotations[acc] = (
-                "dbcan_prot_type={};dbcan_prot_family={};substrate_dbcan-pul={};substrate_dbcan-sub={}".format(
+                "dbcan_prot_type={};{}={};substrate_dbcan-pul={};substrate_dbcan-sub={}".format(
                     prot_type,
+                    DBCAN_CLASSES_DICT[prot_type],
                     prot_fam,
                     substrates[parent]["substrate_pul"],
                     substrates[parent]["substrate_ecami"],
                 )
             )
+
+    return dbcan_annotations
+
+
+def get_dbcan_individual_cazys(dbcan_cazys_file):
+    dbcan_annotations = dict()
+    if not dbcan_cazys_file:
+        return dbcan_annotations
+    with fileinput.hook_compressed(dbcan_cazys_file, "r", encoding="utf-8") as f:
+        for line in f:
+            if line.startswith("#"):
+                continue
+            attributes = line.strip().split("\t")[8]
+            attributes_dict = dict(
+                re.split(r"(?<!\\)=", item)
+                for item in re.split(r"(?<!\\);", attributes.rstrip(";"))
+            )
+            if "num_tools" in attributes_dict and int(attributes_dict["num_tools"]) < 2:
+                continue  # don't keep annotations supported by only one tool within dbcan
+            cds_pattern = r"\.CDS\d+$"
+            protein = re.sub(
+                cds_pattern, "", attributes_dict["ID"]
+            )  # remove the CDS number
+            annotation_text = "dbcan_prot_type=CAZyme;"
+            for field in ["protein_family", "substrate_dbcan-sub", "eC_number"]:
+                if field in attributes_dict:
+                    annotation_text += (
+                        f"{'dbcan_prot_family' if field == 'protein_family' else field}"
+                        f"={attributes_dict[field]};"
+                    )
+            dbcan_annotations[protein] = annotation_text.strip(";")
    return dbcan_annotations
 
 
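The new `get_dbcan_individual_cazys()` parses GFF column 9 with lookbehind-based splits so that backslash-escaped delimiters (`\;`, `\=`) inside attribute values survive. A small sketch of that step, using a hypothetical attribute string:

```python
import re

# Split column 9 on unescaped ";" and "=", then build a dict of attributes.
attributes = "ID=MGYG000500002_1.CDS1;protein_family=GH5;num_tools=2"
attributes_dict = dict(
    re.split(r"(?<!\\)=", item)
    for item in re.split(r"(?<!\\);", attributes.rstrip(";"))
)
# {'ID': 'MGYG000500002_1.CDS1', 'protein_family': 'GH5', 'num_tools': '2'}
```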
@@ -329,7 +379,8 @@ def get_defense_finder(df_file):
     type_info = dict()
     if not df_file:
         return defense_finder_annotations
-    with open(df_file, "r") as f:
+    with fileinput.hook_compressed(df_file, "r", encoding="utf-8") as f:
+
        for line in f:
            if "Anti-phage system" in line:
                annot_fields = line.strip().split("\t")[8].split(";")
@@ -366,6 +417,7 @@ def load_annotations(
     antismash_file,
     gecco_file,
     dbcan_file,
+    dbcan_cazys_file,
     defense_finder_file,
     pseudofinder_file,
 ):
@@ -376,6 +428,7 @@ def load_annotations(
     antismash_bgcs = get_bgcs(antismash_file, in_gff, tool="antismash")
     amr_annotations = get_amr(amr_file)
     dbcan_annotations = get_dbcan(dbcan_file)
+    dbcan_cazys_annotations = get_dbcan_individual_cazys(dbcan_cazys_file)
     defense_finder_annotations = get_defense_finder(defense_finder_file)
     pseudogenes = get_pseudogenes(pseudofinder_file)
     pseudogene_report_dict = dict()
@@ -384,7 +437,7 @@ def load_annotations(
     header = []
     fasta = []
     fasta_flag = False
-    with open(in_gff) as f:
+    with fileinput.hook_compressed(in_gff, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line[0] != "#" and not fasta_flag:
@@ -496,6 +549,11 @@ def load_annotations(
                 added_annot[protein]["dbCAN"] = dbcan_annotations[protein]
             except KeyError:
                 pass
+            try:
+                dbcan_cazys_annotations[protein]
+                added_annot[protein]["dbCAN"] = dbcan_cazys_annotations[protein]
+            except KeyError:
+                pass
            try:
                defense_finder_annotations[protein]
                added_annot[protein]["defense_finder"] = (
@@ -530,7 +588,7 @@ def load_annotations(
530
588
  def get_ncrnas(ncrnas_file):
531
589
  ncrnas = {}
532
590
  counts = 0
533
- with open(ncrnas_file, "r") as f:
591
+ with fileinput.hook_compressed(ncrnas_file, "r", encoding="utf-8") as f:
534
592
  for line in f:
535
593
  if not line.startswith("#"):
536
594
  cols = line.strip().split()
@@ -543,7 +601,9 @@ def get_ncrnas(ncrnas_file):
                     # Skip tRNAs, we add them from tRNAscan-SE
                     continue
                 strand = cols[11]
-                start, end = (int(cols[9]), int(cols[10])) if strand == "+" else (int(cols[10]), int(cols[9]))
+                start, end = int(cols[10]), int(cols[9])
+                if strand == "+":
+                    start, end = end, start
                rna_feature_name, ncrna_class = prepare_rna_gff_fields(cols)
                annot = [
                    "ID=" + locus,
@@ -718,7 +778,10 @@ def prepare_rna_gff_fields(cols):
     }
 
     if rna_feature_name == "ncRNA":
-        ncrna_class = next((rna_type for rna_type, rfams in rna_types.items() if cols[2] in rfams), None)
+        ncrna_class = next(
+            (rna_type for rna_type, rfams in rna_types.items() if cols[2] in rfams),
+            None,
+        )
        if not ncrna_class:
            if "microRNA" in cols[-1]:
                ncrna_class = "pre_miRNA"
@@ -729,7 +792,7 @@ def prepare_rna_gff_fields(cols):
729
792
 
730
793
  def get_trnas(trnas_file):
731
794
  trnas = {}
732
- with open(trnas_file, "r") as f:
795
+ with fileinput.hook_compressed(trnas_file, "r", encoding="utf-8") as f:
733
796
  for line in f:
734
797
  if not line.startswith("#"):
735
798
  cols = line.split("\t")
@@ -738,13 +801,13 @@ def get_trnas(trnas_file):
                 line = line.replace("tRNAscan-SE", "tRNAscan-SE:2.0.9")
                 trnas.setdefault(contig, dict()).setdefault(
                     int(start), list()
-                ).append(line.strip())
+                ).append(line.strip().strip(";"))
    return trnas
 
 
 def load_crispr(crispr_file):
     crispr_annotations = dict()
-    with open(crispr_file, "r") as f:
+    with fileinput.hook_compressed(crispr_file, "r", encoding="utf-8") as f:
        record = list()
        left_coord = ""
        loc_contig = ""
@@ -791,7 +854,7 @@ def get_pseudogenes(pseudofinder_file):
     pseudogenes = dict()
     if not pseudofinder_file:
         return pseudogenes
-    with open(pseudofinder_file) as file_in:
+    with fileinput.hook_compressed(pseudofinder_file, "r", encoding="utf-8") as file_in:
        for line in file_in:
            if not line.startswith("#"):
                col9 = line.strip().split("\t")[8]
--- a/mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py
+++ b/mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py
@@ -28,6 +28,17 @@ def write_results_to_file(
     contig_list = check_for_additional_keys(
         ncrnas, trnas, crispr_annotations, contig_list
     )
+    # sort contigs by digit at the end of contig/genome accession
+    if contig_list[0].startswith(
+        "MGYG"
+    ):  # e.g. 'MGYG000500002_1', 'MGYG000500002_2', 'MGYG000500002_3'
+        contig_list = sorted(list(contig_list), key=lambda x: int(x.split("_")[-1]))
+    elif contig_list[0].startswith(
+        "ERZ"
+    ):  # e.g. 'ERZ1049444', 'ERZ1049445', 'ERZ1049446'
+        contig_list = sorted(
+            list(contig_list), key=lambda x: int(x.split("ERZ")[-1])
+        )
    for contig in contig_list:
        sorted_pos_list = sort_positions(
            contig, main_gff_extended, ncrnas, trnas, crispr_annotations
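Plain lexicographic sorting would place 'MGYG000500002_10' before 'MGYG000500002_2'; keying on the integer after the last underscore (or after the 'ERZ' prefix) restores natural order. A sketch with hypothetical accessions:

```python
contigs = ["MGYG000500002_10", "MGYG000500002_2", "MGYG000500002_1"]
print(sorted(contigs, key=lambda x: int(x.split("_")[-1])))
# ['MGYG000500002_1', 'MGYG000500002_2', 'MGYG000500002_10']
```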
--- a/mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py
+++ b/mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py
@@ -17,27 +17,39 @@
 
 import argparse
 
-from gff_annotation_utils import get_ncrnas, get_trnas, load_annotations, load_crispr
-from gff_file_utils import write_results_to_file, print_pseudogene_report
+from mgnify_pipelines_toolkit.analysis.assembly.gff_annotation_utils import (
+    get_ncrnas,
+    get_trnas,
+    load_annotations,
+    load_crispr,
+)
+from mgnify_pipelines_toolkit.analysis.assembly.gff_file_utils import (
+    write_results_to_file,
+    print_pseudogene_report,
+)
 
 
-def main(
-    gff,
-    ipr_file,
-    eggnog_file,
-    sanntis_file,
-    crispr_file,
-    amr_file,
-    antismash_file,
-    gecco_file,
-    dbcan_file,
-    defense_finder_file,
-    pseudofinder_file,
-    rfam_file,
-    trnascan_file,
-    outfile,
-    pseudogene_report_file,
-):
+def main():
+
+    (
+        gff,
+        ipr_file,
+        eggnog_file,
+        sanntis_file,
+        crispr_file,
+        amr_file,
+        antismash_file,
+        gecco_file,
+        dbcan_file,
+        dbcan_cazys_file,
+        defense_finder_file,
+        pseudofinder_file,
+        rfam_file,
+        trnascan_file,
+        outfile,
+        pseudogene_report_file,
+    ) = parse_args()
+
    # load annotations and add them to existing CDS
    # here header contains leading GFF lines starting with "#",
    # main_gff_extended is a dictionary that contains GFF lines with added in additional annotations
@@ -53,6 +65,7 @@ def main(
         antismash_file,
         gecco_file,
         dbcan_file,
+        dbcan_cazys_file,
         defense_finder_file,
         pseudofinder_file,
     )
@@ -66,7 +79,9 @@ def main(
     if crispr_file:
         crispr_annotations = load_crispr(crispr_file)
 
-    write_results_to_file(outfile, header, main_gff_extended, fasta, ncrnas, trnas, crispr_annotations)
+    write_results_to_file(
+        outfile, header, main_gff_extended, fasta, ncrnas, trnas, crispr_annotations
+    )
    if pseudogene_report_file:
        print_pseudogene_report(pseudogene_report_dict, pseudogene_report_file)
 
@@ -74,7 +89,7 @@ def main(
74
89
  def parse_args():
75
90
  parser = argparse.ArgumentParser(
76
91
  description="The script extends a user-provided base GFF annotation file by incorporating "
77
- "information extracted from the user-provided outputs of supplementary annotation tools.",
92
+ "information extracted from the user-provided outputs of supplementary annotation tools.",
78
93
  )
79
94
  parser.add_argument(
80
95
  "-g",
@@ -124,7 +139,12 @@ def parse_args():
     )
     parser.add_argument(
         "--dbcan",
-        help="The GFF file produced by dbCAN post-processing script",
+        help="The GFF file produced by dbCAN post-processing script that uses cluster annotations",
+        required=False,
+    )
+    parser.add_argument(
+        "--dbcan-cazys",
+        help="The GFF file produced by dbCAN-CAZYs post-processing script",
        required=False,
    )
    parser.add_argument(
@@ -146,12 +166,8 @@ def parse_args():
         "--pseudogene-report", help="Pseudogene report filename", required=False
     )
 
-    return parser.parse_args()
-
-
-if __name__ == '__main__':
-    args = parse_args()
-    main(
+    args = parser.parse_args()
+    return (
        args.gff_input,
        args.ips,
        args.eggnog,
@@ -161,10 +177,15 @@ if __name__ == '__main__':
         args.antismash,
         args.gecco,
         args.dbcan,
+        args.dbcan_cazys,
         args.defense_finder,
         args.pseudofinder,
         args.rfam,
         args.trnascan,
         args.outfile,
         args.pseudogene_report,
-    )
+    )
+
+
+if __name__ == "__main__":
+    main()
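Moving argument parsing out of the `if __name__ == '__main__':` block and into `main()` is what allows the new `gff_toolkit` console entry point (added to entry_points.txt below) to work: setuptools entry points call the target function with no arguments. A minimal sketch of the pattern, with hypothetical flags:

```python
import argparse

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-g", dest="gff_input", required=True)
    args = parser.parse_args()
    return (args.gff_input,)

def main():
    # entry points invoke main() with no arguments, so parsing happens here
    (gff,) = parse_args()
    print(f"processing {gff}")

if __name__ == "__main__":
    main()
```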
--- /dev/null
+++ b/mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py
@@ -0,0 +1,217 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2023-2025 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import argparse
+import fileinput
+import logging
+from pathlib import Path
+import re
+
+logging.basicConfig(level=logging.INFO)
+
+
+def main():
+
+    args = parse_args()
+    hmm_file, overview_file, genome_gff, outfile, dbcan_ver = (
+        args.hmm_file,
+        args.overview_file,
+        args.genome_gff,
+        args.outfile,
+        args.dbcan_ver,
+    )
+
+    hmm_path = Path(hmm_file)
+    overview_path = Path(overview_file)
+
+    if not hmm_path.is_file():
+        raise FileNotFoundError(f"Input hmm path does not exist: {hmm_file}")
+
+    if not overview_path.is_file():
+        raise FileNotFoundError(f"Input overview path does not exist: {overview_file}")
+
+    substrates = load_substrates(hmm_path)
+    genome_gff_lines = load_gff(genome_gff)
+
+    print_gff(overview_file, outfile, dbcan_ver, substrates, genome_gff_lines)
+
+
+def load_gff(gff):
+    genome_gff_lines = dict()
+    with fileinput.hook_compressed(gff, "rt") as gff:
+        for line in gff:
+            if line.startswith("##FASTA"):
+                return genome_gff_lines
+
+            fields = line.strip().split("\t")
+            if len(fields) != 9 or fields[2] != "CDS":
+                continue
+
+            if "Parent=" in line:
+                # Get transcript name from the 9th column for mettannotator
+                match = re.search(r"Parent=([^;]+)", fields[8])
+            elif "ID=" in line:
+                # Get transcript name from the 9th column for ASA
+                match = re.search(r"ID=([^;]+)", fields[8])
+            else:
+                logging.error(
+                    "Not sure what gff annotation delimiter is in use. Exiting"
+                )
+                exit(1)
+
+            transcript_name = match.group(1)
+            genome_gff_lines.setdefault(transcript_name, []).append(line)
+    return genome_gff_lines
+
+
+def print_gff(overview_file, outfile, dbcan_version, substrates, genome_gff_lines):
+    with open(outfile, "w") as file_out:
+        file_out.write("##gff-version 3\n")
+        with fileinput.hook_compressed(overview_file, "rt") as file_in:
+            for line in file_in:
+                if line.startswith("MGYG") or line.startswith("ERZ"):
+                    (
+                        transcript,
+                        ec_number_raw,
+                        dbcan_hmmer,
+                        dbcan_sub_ecami,
+                        diamond,
+                        num_of_tools,
+                    ) = line.strip().split("\t")
+                    # EC is reported as 2.4.99.-:5 with :5 meaning 5 proteins in the subfamily have EC 2.4.99.-
+
+                    ec_number = ""
+                    ec_list = ec_number_raw.split("|")
+                    for ec in ec_list:
+                        if ec != "-":
+                            ec_number += ec.split(":")[0] + "|"
+
+                    ec_number = ec_number.strip("|")
+
+                    # Dbcan recommends to use subfamily preference as dbcan_hmmer > dbcan_sub_ecami > diamond
+                    # diamond is messier, so we don't report it here
+                    if dbcan_hmmer != "-":
+                        # the field dbcan_hmmer reports match positions in parentheses, clear them out first:
+                        subfamily = dbcan_hmmer.split("(")[0]
+                    elif dbcan_sub_ecami != "-":
+                        subfamily = dbcan_sub_ecami
+                    else:
+                        continue
+                    cleaned_substrates = ",".join(
+                        sorted(
+                            {
+                                subsrate.strip()
+                                for subsrate in substrates.get(transcript, "N/A").split(
+                                    ","
+                                )
+                            }
+                        )
+                    )
+                    # Assemble information to add to the 9th column
+                    col9_parts = [
+                        f"protein_family={subfamily}",
+                        f"substrate_dbcan-sub={cleaned_substrates}",
+                    ]
+
+                    if ec_number:
+                        col9_parts.append(f"eC_number={ec_number}")
+
+                    col9_parts.append(f"num_tools={num_of_tools}")
+                    col9_text = ";".join(col9_parts)
+
+                    for gff_line in genome_gff_lines[transcript]:
+                        fields = gff_line.strip().split("\t")
+                        # Replace the tool
+                        fields[1] = f"dbCAN:{dbcan_version}"
+                        # Replace the feature
+                        fields[2] = "CAZyme"
+                        # Replace the confidence value
+                        fields[5] = "."
+                        # Keep only the ID in the 9th column
+                        attributes = fields[8].split(";")[0]
+                        # Add dbcan information to the 9th column
+                        attributes = f"{attributes};{col9_text};"
+                        fields[8] = attributes
+                        file_out.write("\t".join(fields) + "\n")
+
+
+def load_substrates(hmm_path):
+    substrates = dict()
+    with fileinput.hook_compressed(hmm_path, "rt") as file_in:
+        header = next(file_in)
+        header_fields = header.strip().split("\t")
+        substrate_idx = header_fields.index("Substrate")
+        gene_idx = header_fields.index("Gene ID")
+        evalue_idx = header_fields.index("E Value")
+        for line in file_in:
+            fields = line.strip().split("\t")
+            if float(fields[evalue_idx]) < 1e-15:  # evalue is the default from dbcan
+                substrate = fields[substrate_idx]
+                if not substrate == "-":
+                    gene_id = fields[gene_idx]
+                    substrates.setdefault(gene_id, []).append(substrate)
+    # resolve cases with multiple substrates
+    for gene_id, substrate_list in substrates.items():
+        substrate_list = list(set(substrate_list))
+        if len(substrate_list) == 1:
+            substrates[gene_id] = substrate_list[0]
+        else:
+            substrates[gene_id] = ",".join(substrate_list)
+    return substrates
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description=(
+            "The script takes dbCAN output for a eukaryotic genome and parses it to create a standalone GFF."
+        )
+    )
+    parser.add_argument(
+        "-hmm",
+        dest="hmm_file",
+        required=True,
+        help="Path to the hmm file.",
+    )
+    parser.add_argument(
+        "-ov",
+        dest="overview_file",
+        required=True,
+        help="Path to the overview file.",
+    )
+    parser.add_argument(
+        "-g",
+        dest="genome_gff",
+        required=True,
+        help="Path to the genome GFF.",
+    )
+    parser.add_argument(
+        "-o",
+        dest="outfile",
+        required=True,
+        help="Path to the output file.",
+    )
+    parser.add_argument(
+        "-v",
+        dest="dbcan_ver",
+        required=True,
+        help="dbCAN version used.",
+    )
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    main()
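As the inline comment in `print_gff()` notes, dbCAN reports EC numbers as `2.4.99.-:5`, where `:5` is the count of subfamily proteins carrying that EC; the loop above keeps only the EC part. An equivalent one-line sketch with a hypothetical raw value:

```python
ec_number_raw = "2.4.99.-:5|3.2.1.4:2"  # hypothetical dbCAN field
ec_number = "|".join(ec.split(":")[0] for ec in ec_number_raw.split("|") if ec != "-")
print(ec_number)  # 2.4.99.-|3.2.1.4
```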
--- /dev/null
+++ b/mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python3
+
+# Copyright 2023-2024 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import argparse
+import fileinput
+import logging
+from pathlib import Path
+
+logging.basicConfig(level=logging.INFO)
+
+
+def main():
+
+    args = parse_args()
+    standard_file, substrate_file, outfile, dbcan_ver = (
+        args.standard_file,
+        args.substrate_file,
+        args.outfile,
+        args.dbcan_ver,
+    )
+    standard_path = Path(standard_file)
+    substrate_path = Path(substrate_file)
+
+    if not standard_path.exists():
+        raise FileNotFoundError(f"Input standards path does not exist: {standard_file}")
+
+    if not substrate_path.exists():
+        raise FileNotFoundError(
+            f"Input substrate path does not exist: {substrate_file}"
+        )
+
+    substrates = load_substrates(substrate_path)
+    cgc_locations = load_cgcs(standard_path)
+    print_gff(standard_path, outfile, dbcan_ver, substrates, cgc_locations)
+
+
+def load_cgcs(standard_path):
+    cgc_locations = dict()
+    with fileinput.hook_compressed(standard_path, "rt") as file_in:
+        for line in file_in:
+            if not line.startswith("CGC#"):
+                cgc, _, contig, _, start, end, _, _ = line.strip().split("\t")
+                cgc_id = f"{contig}_{cgc}"
+                if cgc_id in cgc_locations:
+                    if cgc_locations[cgc_id]["start"] > int(start):
+                        cgc_locations[cgc_id]["start"] = int(start)
+                    if cgc_locations[cgc_id]["end"] < int(end):
+                        cgc_locations[cgc_id]["end"] = int(end)
+                else:
+                    cgc_locations[cgc_id] = {
+                        "start": int(start),
+                        "end": int(end),
+                        "contig": contig,
+                    }
+    return cgc_locations
+
+
+def print_gff(standard_path, outfile, dbcan_version, substrates, cgc_locations):
+    with open(outfile, "w") as file_out:
+        file_out.write("##gff-version 3\n")
+        cgcs_printed = list()
+        with fileinput.hook_compressed(standard_path, "rt") as file_in:
+            for line in file_in:
+                if not line.startswith("CGC#"):
+                    cgc, gene_type, contig, prot_id, start, end, strand, protein_fam = (
+                        line.strip().split("\t")
+                    )
+                    cgc_id = f"{contig}_{cgc}"
+                    protein_fam = protein_fam.replace(" ", "")
+                    if cgc_id not in cgcs_printed:
+                        substrate = (
+                            substrates[cgc_id]
+                            if cgc_id in substrates
+                            else "substrate_dbcan-pul=N/A;substrate_dbcan-sub=N/A"
+                        )
+                        file_out.write(
+                            "{}\tdbCAN:{}\tpredicted PUL\t{}\t{}\t.\t.\t.\tID={};{}\n".format(
+                                contig,
+                                dbcan_version,
+                                cgc_locations[cgc_id]["start"],
+                                cgc_locations[cgc_id]["end"],
+                                cgc_id,
+                                substrate,
+                            )
+                        )
+                        cgcs_printed.append(cgc_id)
+                    file_out.write(
+                        (
+                            f"{contig}\tdbCAN:{dbcan_version}\t{gene_type}\t{start}"
+                            + f"\t{end}\t.\t{strand}\t.\tID={prot_id};Parent={cgc_id};protein_family={protein_fam}\n"
+                        )
+                    )
+
+
+def load_substrates(substrate_path):
+    substrates = dict()
+    with fileinput.hook_compressed(substrate_path, "rt") as file_in:
+        for line in file_in:
+            if not line.startswith("#"):
+                parts = line.strip().split("\t")
+                cgc_parts = parts[0].rsplit("|", 1)
+                cgc = "_".join(cgc_parts)
+                try:
+                    substrate_pul = parts[2]
+                except IndexError:
+                    substrate_pul = "N/A"
+                try:
+                    substrate_ecami = parts[5]
+                except IndexError:
+                    substrate_ecami = "N/A"
+                if not substrate_pul:
+                    substrate_pul = "N/A"
+                if not substrate_ecami:
+                    substrate_ecami = "N/A"
+                substrates[cgc] = (
+                    f"substrate_dbcan-pul={substrate_pul};substrate_dbcan-sub={substrate_ecami}"
+                )
+
+    return substrates
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description=(
+            "The script takes dbCAN output and parses it to create a standalone GFF."
+        )
+    )
+    parser.add_argument(
+        "-st",
+        dest="standard_file",
+        required=True,
+        help="Path to the standard file (*cgc_standard.out)",
+    )
+    parser.add_argument(
+        "-sb",
+        dest="substrate_file",
+        required=True,
+        help="Path to the substrate file (*substrate.out)",
+    )
+    parser.add_argument(
+        "-o",
+        dest="outfile",
+        required=True,
+        help="Path to the output file.",
+    )
+    parser.add_argument(
+        "-v",
+        dest="dbcan_ver",
+        required=True,
+        help="dbCAN version used.",
+    )
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    main()
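`load_cgcs()` widens each cluster's coordinates to cover every member gene row; the same logic in isolation, with hypothetical rows:

```python
rows = [("contig1_CGC1", 100, 400), ("contig1_CGC1", 50, 200)]  # (cgc_id, start, end)
cgc_locations = {}
for cgc_id, start, end in rows:
    if cgc_id in cgc_locations:
        cgc_locations[cgc_id]["start"] = min(cgc_locations[cgc_id]["start"], start)
        cgc_locations[cgc_id]["end"] = max(cgc_locations[cgc_id]["end"], end)
    else:
        cgc_locations[cgc_id] = {"start": start, "end": end}
print(cgc_locations)  # {'contig1_CGC1': {'start': 50, 'end': 400}}
```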
--- /dev/null
+++ b/mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py
@@ -0,0 +1,230 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import fileinput
+import logging
+import pandas as pd
+
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s: %(message)s"
+)
+
+ANTISMASH_VERSION = "7.1.x"
+
+f"""
+Script parses antismash GFF output and adds descriptions from pre-parsed glossary https://docs.antismash.secondarymetabolites.org/glossary/.
+Glossary was taken from version {ANTISMASH_VERSION} and commit dbeeb0e https://github.com/antismash/documentation/blob/master/docs/glossary.md
+"""
+
+DESCRIPTIONS = {
+    "2dos": "2-deoxy-streptamine aminoglycoside",
+    "acyl_amino_acids": "N-acyl amino acid",
+    "amglyccycl": "Aminoglycoside/aminocyclitol",
+    "aminocoumarin": "Aminocoumarin",
+    "aminopolycarboxylic-acid": "Aminopolycarboxylic acid metallophores (doi:10.1039/C8MT00009C)",
+    "archaeal-ripp": "Archaeal RiPPs (doi:10.1021/jacs.2c00521 supplemental)",
+    "arylpolyene": "Aryl polyene",
+    "atropopeptide": "Atropopeptide RiPPs, e.g. scabrirubin and tryptorubin",
+    "azoxy-crosslink": "axoxy compounds formed by carboxilic cross-link",
+    "azoxy-dimer": "axoxy compounds formed by dimerisation",
+    "benzoxazole": "Benzoxazoles",
+    "betalactone": "Beta-lactone containing protease inhibitor",
+    "blactam": "&beta;-lactam",
+    "bottromycin": "Bottromycin",
+    "butyrolactone": "Butyrolactone",
+    "cdps": "tRNA-dependent cyclodipeptide synthases",
+    "crocagin": "Crocagin-like",
+    "cyanobactin": "Cyanobactins like patellamide (AY986476)",
+    "cyclic-lactone-autoinducer": "agrD-like cyclic lactone autoinducer peptides (AF001782)",
+    "cytokinin": "Adenine-type cytokinins, e.g. fusatin and trans-zeatin",
+    "darobactin": "Darobactin-like compounds",
+    "deazapurine": "Deazapurine",
+    "ectoine": "Ectoine",
+    "epipeptide": "D-amino-acid containing RiPPs such as yydF (D78193)",
+    "fungal_cdps": "Fungal cyclodipeptide synthases",
+    "fungal-ripp": "Fungal RiPP with POP or UstH peptidase types and a modification",
+    "furan": "Furan",
+    "glycocin": "Glycocin",
+    "guanidinotides": "Pheganomycin-style protein ligase-containing cluster",
+    "hgle-ks": "Heterocyst glycolipid synthase-like PKS",
+    "hr-t2pks": "Highly reducing type II PKS like ishigamide and skyllamycin",
+    "hserlactone": "Homoserine lactone",
+    "hydrogen-cyanide": "Hydrogen cyanide (AF208523, doi:10.1128/jb.182.24.6940-6949.20)",
+    "hydroxy-tropolone": "7-hydroxytropolone-like cluster",
+    "indole": "Indole",
+    "isocyanide": "Isocyanides (doi:10.1093/nar/gkad573)",
+    "nrp with isocyanide": "Isocyanides (doi:0.1128/mBio.00785-18)",
+    "ladderane": "Ladderane",
+    "lanthipeptide class i": "Class I lanthipeptides like nisin",
+    "lanthipeptide class ii": "Class II lanthipeptides like mutacin II (U40620)",
+    "lanthipeptide class iii": "Class III lanthipeptides like labyrinthopeptin (FN178622)",
+    "lanthipeptide class iv": "Class IV lanthipeptides like venezuelin (HQ328852)",
+    "lanthipeptide class v": "Glycosylated lanthipeptide/linaridin hybrids like MT210103",
+    "lassopeptide": "Lasso peptide",
+    "leupeptin": "leupeptin-like compounds",
+    "linaridin": "Linear arid peptide such as cypemycin (HQ148718) and salinipeptin (MG788286)",
+    "lincosamides": "NRPS-adjacent biosynthesis of lincosamides",
+    "lipolanthine": "Lanthipeptide class containing N-terminal fatty acids such as MG673929",
+    "melanin": "Melanin",
+    "methanobactin": "Copper-chelating/transporting peptides (doi:10.1126/science.aap9437)",
+    "microviridin": "Microviridin",
+    "mycosporine": "Molecules containing mycosporine-like amino acid",
+    "naggn": "N-acetylglutaminylglutamine amide",
+    "napaa": "Non-alpha poly-amino acids like e-Polylysin",
+    "ni-siderophore": "NRPS-independent, IucA/IucC-like siderophores (*siderophore* prior to 7.0)",
+    "nitropropanoic-acid": "3-Nitropropanoic acid (neurotoxin)",
+    "nrps": "Non-ribosomal peptide synthetase",
+    "nrp-metallophore": "Non-ribosomal peptide metallophores",
+    "nucleoside": "Nucleoside",
+    "oligosaccharide": "Oligosaccharide",
+    "opine-like-metallophore": "Opine-like zincophores like staphylopine (doi:10.1128/mSystems.00554-20)",
+    "other": "Cluster containing a secondary metabolite-related protein that does not fit into any other category",
+    "pbde": "Polybrominated diphenyl ether",
+    "phenazine": "Phenazine",
+    "phosphoglycolipid": "Phosphoglycolipid",
+    "phosphonate": "Phosphonate",
+    "polyhalogenated-pyrrole": "Polyhalogenated pyrrole",
+    "polyyne": "Polyyne",
+    "ppys-ks": "PPY-like pyrone",
+    "prodigiosin": "Serratia-type non-traditional PKS prodigiosin biosynthesis pathway",
+    "proteusin": "Proteusin",
+    "pufa": "Polyunsaturated fatty acid",
+    "pyrrolidine": "Pyrrolidines like described in BGC0001510",
+    "ranthipeptide": "Cys-rich peptides (aka. SCIFF: six Cys in fourty-five) like in CP001581:3481278-3502939",
+    "ras-ripp": "Streptide-like thioether-bond RiPPs",
+    "rcdps": "Fungal Arginine-containing cyclic dipeptides",
+    "redox-cofactor": "Redox-cofactors such as PQQ (NC_021985:1458906-1494876)",
+    "resorcinol": "Resorcinol",
+    "sactipeptide": "Sactipeptide",
+    "spliceotide": "RiPPs containing plpX type spliceases (NZ_KB235920:17899-42115)",
+    "t1pks": "Type I PKS (Polyketide synthase)",
+    "t2pks": "Type II PKS",
+    "t3pks": "Type III PKS",
+    "terpene": "Terpene",
+    "thioamitides": "Thioamitide RiPPs as found in JOBF01000011",
+    "thioamide-nrp": "Thioamide-containing non-ribosomal peptide",
+    "transat-pks": "Trans-AT PKS",
+    "triceptide": "Triceptides",
+    "tropodithietic-acid": "Tropodithietic acid",
+    "fungal-ripp-like": "Fungal RiPP-likes",
+    "nrps-like": "NRPS-like fragment",
+    "phosphonate-like": "Phosphonate-like (prior to 7.0 this was the phosphonate rule)",
+    "pks-like": "Other types of PKS",
+    "ripp-like": "Other unspecified ribosomally synthesised and post-translationally modified peptide product (RiPP)",
+    "rre-containing": "RRE-element containing cluster",
+    "terpene-precursor": "Compound likely used as a terpene precursor",
+    "transat-pks-like": "Trans-AT PKS fragment, with trans-AT domain not found",
+    "fatty_acid": "Fatty acid (loose strictness, likely from primary metabolism)",
+    "halogenated": "Halogenase-containing cluster, potentially generating a halogenated product",
+    "lysine": "Fungal lysine primary metabolism",
+    "saccharide": "Saccharide (loose strictness, likely from primary metabolism)",
+    "lap": "Linear azol(in)e-containing peptides",
+    "mycosporine-like": "Molecules containing mycosporine-like amino acid",
+    "thiopeptide": "Thiopeptide",
+    "siderophore": "Siderophore",
+    "bacteriocin": "Bacteriocin or other unspecified ribosomally synthesised and post-translationally modified peptide product (RiPP)",
+    "fused": "Pheganomycin-style protein ligase-containing cluster",
+    "head_to_tail": "Head-to-tail cyclised RiPP (subtilosin-like)",
+    "lanthidin": "Glycosylated lanthipeptide/linaridin hybrids like MT210103",
+    "lanthipeptide": "Lanthipeptides",
+    "tfua-related": "TfuA-related RiPPs",
+    "otherks": "Other types of PKS",
+    "microcin": "Microcin",
+    "cf_saccharide": "Possible saccharide",
+    "cf_fatty_acid": "Possible fatty acid",
+    "cf_putative": "Putative cluster of unknown type identified with the ClusterFinder algorithm",
+}
+
+
+def parse_args():
+    description = (
+        "antiSMASH output summary generator. "
+        "Script takes regions from GFF and counts its appearance in annotation. "
+        "Output columns contain classID, descriptions and count. "
+        f"Descriptions were taken from pre-parsed glossary provided on antiSMASH website. "
+        f"Current script supports antiSMASH results for version {ANTISMASH_VERSION} and older."
+    )
+    parser = argparse.ArgumentParser(description=description)
+    parser.add_argument("-i", "--antismash-gff", help="antiSMASH GFF", required=True)
+    parser.add_argument(
+        "-o", "--output", help="Antisamsh summary TSV output file.", required=True
+    )
+    parser.add_argument(
+        "-a",
+        "--antismash-version",
+        help="antiSMASH version that was used to generate GFF",
+        required=False,
+        default=ANTISMASH_VERSION,
+    )
+    args = parser.parse_args()
+    if args.antismash_version > ANTISMASH_VERSION:
+        logging.error(
+            "Provided version of antiSMASH is bigger than supported. "
+            "Please, make sure you have updated descriptions dictionary. Exit."
+        )
+        exit(1)
+    return args.antismash_gff, args.output
+
+
+def main():
+    input_gff, output_filename = parse_args()
+    dict_list = []
+    with fileinput.hook_compressed(input_gff, "r") as file_in:
+        # TODO: to be merged with the GFF toolkit
+        for line in file_in:
+            if line.startswith("#"):
+                continue
+            info = line.strip().split("\t")[8].split(";")
+            entry_dict = {}
+            for pair in info:
+                key, value = pair.split(
+                    "=", 1
+                )  # Ensure split only occurs at the first '=' occurrence
+                entry_dict[key] = value
+            dict_list.append(entry_dict)
+
+    # Convert to DataFrame
+    df = pd.DataFrame(dict_list)
+    df = df[df["product"].notna()]
+    df_grouped = (
+        df.groupby(["product"]).size().reset_index(name="Count")
+    ).sort_values(by="Count", ascending=False)
+
+    df_grouped = df_grouped.rename(
+        columns={
+            "product": "label",
+        }
+    )
+    df_grouped["Description"] = df_grouped["label"].apply(
+        lambda x: ",".join(
+            [
+                DESCRIPTIONS.get(cls.strip().lower(), cls.strip())
+                for cls in x.split(",")
+            ]
+        )
+    )
+    df_grouped = df_grouped[["label", "Description", "Count"]]
+    df_grouped = df_grouped.rename(columns={
+        "Description": "description",
+        "Count": "count"
+    })
+    df_grouped.to_csv(output_filename, sep="\t", index=False)
+
+
+if __name__ == "__main__":
+    main()
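The summary step is a plain pandas group-and-count over the `product` attribute; a sketch with hypothetical products:

```python
import pandas as pd

df = pd.DataFrame({"product": ["terpene", "NRPS", "terpene"]})
counts = (
    df.groupby(["product"]).size().reset_index(name="count")
).sort_values(by="count", ascending=False)
print(counts)  # terpene appears twice, NRPS once
```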
--- /dev/null
+++ b/mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import fileinput
+import logging
+import pandas as pd
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s: %(message)s"
+)
+
+SANNTIS_VERSION = "0.9.4.1"
+
+f"""
+Script parses SanntiS GFF output and adds descriptions of annotated MIBiGs classes.
+Descriptions were pre-parsed for version {SANNTIS_VERSION} and stored as a dictionary.
+"""
+
+DESCRIPTIONS = {
+    "Polyketide": "Built from iterative condensation of acetate units derived from acetyl-CoA",
+    "Terpene": "Composed of isoprene (C5) units derived from isopentenyl pyrophosphate",
+    "Alkaloid": "Nitrogen-containing compounds derived from amino acids (e.g., ornithine, lysine, tyrosine, tryptophan)",
+    "RiPP": "Ribosomally synthesised and Post-translationally modified Peptide",
+    "NRP": "Nonribosomal Peptide",
+    "Saccharide": "Carbohydrate-based natural products (e.g., aminoglycoside antibiotics)",
+    "Other": "Catch-all class for clusters encoding metabolites outside main classes (e.g., cyclitols, indolocarbazoles, and phosphonates)",
+}
+
+
+def parse_args():
+    description = (
+        "Sanntis output summary generator. "
+        "Script takes SanntiS GFF and counts pairs of (nearest_MiBIG, nearest_MiBIG_class)."
+        "It also adds pre-parsed descriptions of classes stored in that script as a dictionary. "
+        f"Descriptions were taken from SanntiS docs v{SANNTIS_VERSION}."
+    )
+    parser = argparse.ArgumentParser(description=description)
+    parser.add_argument("-i", "--sanntis-gff", help="SanntiS GFF", required=True)
+    parser.add_argument(
+        "-o", "--output", help="SanntiS summary TSV output file.", required=True
+    )
+    args = parser.parse_args()
+    return args.sanntis_gff, args.output
+
+
+def main():
+    input_gff, output_filename = parse_args()
+    dict_list = []
+    with fileinput.hook_compressed(input_gff, "r") as file_in:
+        # TODO: to be merged with the GFF toolkit
+        for line in file_in:
+            if line.startswith("#"):
+                continue
+            info = line.strip().split("\t")[8].split(";")
+            entry_dict = {}
+            # TODO: merge this with the GFF toolkit GFF reader
+            for pair in info:
+                key, value = pair.split(
+                    "=", 1
+                )  # Ensure split only occurs at the first '=' occurrence
+                entry_dict[key] = value
+            dict_list.append(entry_dict)
+
+    # Convert to DataFrame
+    df = pd.DataFrame(dict_list)
+    df = df.rename(
+        columns={
+            "nearest_MiBIG": "nearest_MIBiG",
+            "nearest_MiBIG_class": "nearest_MIBiG_class",
+        }
+    )
+    df_grouped = (
+        df.groupby(["nearest_MIBiG", "nearest_MIBiG_class"])
+        .size()
+        .reset_index(name="Count")
+    )
+    df_grouped = df_grouped.sort_values(by="Count", ascending=False)
+
+    df_desc = pd.DataFrame(
+        list(DESCRIPTIONS.items()), columns=["MIBiG_class", "Description"]
+    )
+    df_desc = df_desc.set_index("MIBiG_class")
+    df_merged = df_grouped.merge(
+        df_desc, left_on="nearest_MIBiG_class", right_index=True, how="left"
+    )
+    df_merged["Description"] = df_merged.apply(
+        lambda row: row["nearest_MIBiG_class"].replace(
+            "NRP", df_desc.loc["NRP"]["Description"]
+        )
+        if pd.isna(row["Description"]) and "NRP" in row["nearest_MIBiG_class"]
+        else row["Description"],
+        axis=1,
+    )
+    df_merged = df_merged[
+        ["nearest_MIBiG", "nearest_MIBiG_class", "Description", "Count"]
+    ]
+    df_merged = df_merged.rename(columns={
+        "Description": "description",
+        "Count": "count"
+    })
+    df_merged.to_csv(output_filename, sep="\t", index=False)
+
+
+if __name__ == "__main__":
+    main()
--- a/mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py
+++ b/mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py
@@ -62,9 +62,12 @@ class TableModifier:
         self.output_file = output_file
 
     def modify_table(self):
-        with fileinput.hook_compressed(self.input_file, "rt") as file_in, open(
-            self.output_file, "w"
-        ) as file_out:
+        with (
+            fileinput.hook_compressed(
+                self.input_file, "r", encoding="utf-8"
+            ) as file_in,
+            open(self.output_file, "w") as file_out,
+        ):
            header_written = False
            separator_line, header = "", ""
            for line in file_in:
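The rewritten `modify_table()` uses a parenthesised multi-item `with` statement, which (like the `encoding` keyword of `hook_compressed`) requires Python 3.10+. The shape of the construct, with hypothetical filenames:

```python
import fileinput

with (
    fileinput.hook_compressed("table.tsv.gz", "r", encoding="utf-8") as file_in,
    open("table_out.tsv", "w") as file_out,
):
    for line in file_in:
        file_out.write(line)
```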
--- a/mgnify_pipelines_toolkit/constants/thresholds.py
+++ b/mgnify_pipelines_toolkit/constants/thresholds.py
@@ -26,9 +26,9 @@ MAX_INTERNAL_PRIMER_PROPORTION = 0.2
 # used by library_strategy_checker in analysis.shared
 MIN_AMPLICON_STRATEGY_CHECK = 0.30
 
+
 # used by markergene_study_summary in analysis.shared
 MAJORITY_MARKER_PROPORTION = 0.45
-
 # used by gff_toolkit in analysis.assembly
 EVALUE_CUTOFF_IPS = 1e-10
 EVALUE_CUTOFF_EGGNOG = 1e-10
--- a/mgnify_pipelines_toolkit-1.0.4.dist-info/METADATA
+++ b/mgnify_pipelines_toolkit-1.0.6.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mgnify_pipelines_toolkit
-Version: 1.0.4
+Version: 1.0.6
 Summary: Collection of scripts and tools for MGnify pipelines
 Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
 License: Apache Software License 2.0
--- a/mgnify_pipelines_toolkit-1.0.4.dist-info/RECORD
+++ b/mgnify_pipelines_toolkit-1.0.6.dist-info/RECORD
@@ -16,15 +16,19 @@ mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py,sha256=N
 mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py,sha256=wXrw1B-z4hOu5oA27Vp1WYxGP2Mk6ZY4i_T5jDZgek0,6954
 mgnify_pipelines_toolkit/analysis/assembly/combined_gene_caller_merge.py,sha256=Pq-9RSt3RCxzDMQVW1VHlHF4NtpVwCWFbg2CMkvpZZc,19089
 mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py,sha256=2T4T7aXMGPac-LZUXJF3lOUzZZF50dAKkKTSaO-4idQ,3587
-mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py,sha256=IlkeP4DuN7rXJIHa7o2sONHAXLhV9nGP-5Y1_0u8YQo,31393
-mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py,sha256=8kv_6KWznOVRkeAtghLf4pxKPhAqdn36LOK4MsTz9hU,3282
-mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py,sha256=uUIo97gmzO2zzN-pYF5paIzeHWBsmmjFp7zGAhf4PKY,5021
+mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py,sha256=6gbCRlEX1eBqzFYjOt3og-961dZ--QsCJL-7l5nzg1k,33992
+mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py,sha256=_4J31wAjK5B15p9lDzTG2wmZdyoZkOgmy7Kp_w8lTeE,3812
+mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py,sha256=_iaTBvMKbQDi_02_QuSPqLJ_rC37ruxiPHv5lLQmI-w,5480
 mgnify_pipelines_toolkit/analysis/assembly/go_utils.py,sha256=eay9e3Xdc8XxnlC_4SHHjN89k-M9i_cFMc2lI_ZFxqY,5596
 mgnify_pipelines_toolkit/analysis/assembly/krona_txt_from_cat_classification.py,sha256=uex2T6GagtYFBIc39-Xm4SFHL06KAQ5v0_loOmY_eaw,4289
+mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py,sha256=5lbVSWRZoi61cKvuolzUJlhUBzpx8DgWMH0Vzw1HcHA,7748
+mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py,sha256=i4uYdqY6y2ee72vl0sLkHeJvigHGKJMzdyR3HEIK1Mk,5930
+mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py,sha256=eRAQ0vFbqnWreiBdtFuwLKve9WwYwv9dYQtD1pumaZs,10776
 mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py,sha256=TPaKlYkoy37_XgYNOskWCCoXtPNku_k5ygSeK4fT1VQ,6689
+mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py,sha256=65szj-H8Hxy_eXy3TyTs48EhPJbJ2w1skHlVbH2YeVM,4538
 mgnify_pipelines_toolkit/analysis/genomes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 mgnify_pipelines_toolkit/analysis/shared/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py,sha256=0Ot1j4LPsEPyPbySSAh6n9s5Dilm_8_M9YQvTnQ-1PQ,4415
+mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py,sha256=kAGU5kQyj-Hlcdx32i-xOJSuHYYUDj-kqnyYHMohHGc,4477
 mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py,sha256=hggPqv9QawWAccm5tmru4VF9VnQAHF5LCXnqyLw_BWI,6727
 mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py,sha256=ye0Jka6_lNn4dQGb2QG3YT46y7QK0QvyaIitIaS8JVQ,4026
 mgnify_pipelines_toolkit/analysis/shared/get_subunits.py,sha256=UrU0CpZj3pfHZWI7Uuhv2a_C0JsO8pnVErY0sWGgNdw,4920
@@ -38,15 +42,15 @@ mgnify_pipelines_toolkit/constants/ncrna.py,sha256=a_5hWp446S7BhRbe_JcydFgZM7sgP
 mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py,sha256=7nEOODQq35y9wx9YnvJuo29oBpwTpXg_kIbf_t7N4TQ,1093
 mgnify_pipelines_toolkit/constants/regex_fasta_header.py,sha256=G-xrc9b8zdmPTaOICD2b3RCVeFAEOVkfRkIfotQ7gek,1193
 mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=kMq__kOJcbiwsgolkdvb-XLo3WMnJdEXgedjUyMOYjI,1081
-mgnify_pipelines_toolkit/constants/thresholds.py,sha256=guDE7c4KrVJEfg_AcO_cQoJM6LGGaRlmo_U2i8d4N7g,1157
+mgnify_pipelines_toolkit/constants/thresholds.py,sha256=V_xDBk0RhS3hHeWqOacKzth2gM6zJABRPgwHy-Ciqfk,1157
 mgnify_pipelines_toolkit/constants/var_region_coordinates.py,sha256=0bM4MwarFiM5yTcp5AbAmQ0o-q-gWy7kknir9zJ9R0A,1312
 mgnify_pipelines_toolkit/schemas/schemas.py,sha256=pnH8LUH8i2ACNvFNWyG-n-eIHZcI5O9UDYulkh43mec,7692
 mgnify_pipelines_toolkit/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 mgnify_pipelines_toolkit/utils/fasta_to_delimited.py,sha256=lgYIR1S4crURY7C7nFtgE6QMV4u4zCNsUrVkcRnsEEo,3996
 mgnify_pipelines_toolkit/utils/get_mpt_version.py,sha256=aS9bWrC9CP7tpxoEVg6eEYt18-pmjG7fJl5Mchz4YOU,798
-mgnify_pipelines_toolkit-1.0.4.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-mgnify_pipelines_toolkit-1.0.4.dist-info/METADATA,sha256=Coky89dC0Xh5wHLk7fPGEOk_-fXY3GvvMMtb2dz5krc,5810
-mgnify_pipelines_toolkit-1.0.4.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-mgnify_pipelines_toolkit-1.0.4.dist-info/entry_points.txt,sha256=QZ6vY4w3lYG8Xmll_s9SIsOpkxa5gBVEIxU3GvoCF4I,2946
-mgnify_pipelines_toolkit-1.0.4.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
-mgnify_pipelines_toolkit-1.0.4.dist-info/RECORD,,
+mgnify_pipelines_toolkit-1.0.6.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mgnify_pipelines_toolkit-1.0.6.dist-info/METADATA,sha256=b9Hoo0e0xVvL4erImJgt_7gtbb-5Yx8TZNlf9KZcQIY,5810
+mgnify_pipelines_toolkit-1.0.6.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
+mgnify_pipelines_toolkit-1.0.6.dist-info/entry_points.txt,sha256=T8soGT2to8c_qafw-0itqCn4sjOnxlfaNWHIaHz4H54,3416
+mgnify_pipelines_toolkit-1.0.6.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
+mgnify_pipelines_toolkit-1.0.6.dist-info/RECORD,,
--- a/mgnify_pipelines_toolkit-1.0.4.dist-info/WHEEL
+++ b/mgnify_pipelines_toolkit-1.0.6.dist-info/WHEEL
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (78.1.0)
+Generator: setuptools (79.0.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
--- a/mgnify_pipelines_toolkit-1.0.4.dist-info/entry_points.txt
+++ b/mgnify_pipelines_toolkit-1.0.6.dist-info/entry_points.txt
@@ -18,6 +18,7 @@ genomes_extract_trnas = mgnify_pipelines_toolkit.analysis.genomes.rna.extract_tr
 get_mpt_version = mgnify_pipelines_toolkit.utils.get_mpt_version:main
 get_subunits = mgnify_pipelines_toolkit.analysis.shared.get_subunits:main
 get_subunits_coords = mgnify_pipelines_toolkit.analysis.shared.get_subunits_coords:main
+gff_toolkit = mgnify_pipelines_toolkit.analysis.assembly.gff_toolkit:main
 krona_txt_from_cat_classification = mgnify_pipelines_toolkit.analysis.assembly.krona_txt_from_cat_classification:main
 library_strategy_check = mgnify_pipelines_toolkit.analysis.shared.library_strategy_check:main
 make_asv_count_table = mgnify_pipelines_toolkit.analysis.amplicon.make_asv_count_table:main
@@ -25,8 +26,12 @@ mapseq2biom = mgnify_pipelines_toolkit.analysis.shared.mapseq2biom:main
 mapseq_to_asv_table = mgnify_pipelines_toolkit.analysis.amplicon.mapseq_to_asv_table:main
 markergene_study_summary = mgnify_pipelines_toolkit.analysis.shared.markergene_study_summary:main
 primer_val_classification = mgnify_pipelines_toolkit.analysis.amplicon.primer_val_classification:main
+process_dbcan_cazys = mgnify_pipelines_toolkit.analysis.assembly.process_dbcan_result_cazys:main
+process_dbcan_clusters = mgnify_pipelines_toolkit.analysis.assembly.process_dbcan_result_clusters:main
 remove_ambiguous_reads = mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main
 rev_comp_se_primers = mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main
 standard_primer_matching = mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main
 study_summary_generator = mgnify_pipelines_toolkit.analysis.shared.study_summary_generator:cli
+summarise_antismash_bgcs = mgnify_pipelines_toolkit.analysis.assembly.summarise_antismash_bgcs:main
 summarise_goslims = mgnify_pipelines_toolkit.analysis.assembly.summarise_goslims:main
+summarise_sanntis_bgcs = mgnify_pipelines_toolkit.analysis.assembly.summarise_sanntis_bgcs:main